In [ ]:
import sys
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from collections import Counter
import random
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding, TSNE
from adjustText import adjust_text
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import DBSCAN
from xgboost import XGBClassifier
import folium

# Utility function for graphing
def plot_cols(dataframe, data_cols, subplot_columns, graph_type, **kwargs):
    
    # If input for subplot_columns is not valid pick the best option
    gcd = np.gcd(len(data_cols), subplot_columns)
    if gcd == 1:
        num_columns = subplot_columns
    else:
        num_columns = gcd
   

    # Assign columns to a subplot
    plot_type = getattr(sns, graph_type)
    matrix = np.array(data_cols).reshape(-1, num_columns)
    rows, columns = matrix.shape

    # Dynamically change figsize based on # of rows/columns
    height_per_row = 5  # You can adjust this value depending on how much space each plot needs
    width_per_column = 5  # You can adjust this value for plot width
    figsize = (width_per_column * columns, height_per_row * rows)
    fig, axs = plt.subplots(rows, columns, figsize=figsize)
   
    for i, ax in enumerate(axs.flat):
        if i < len(data_cols):
            col = data_cols[i]
            plot_type(data=dataframe, x=col, ax=ax, **kwargs)
            skewness = np.mean((dataframe[col] - np.mean(dataframe[col])) **3 ) / np.std(dataframe[col])**3
            if skewness > 1:
                ax.set_xscale('log')
            ax.set_title(col)
        else:
            ax.axis('off')
    plt.tight_layout(pad=5)
    plt.show()

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_columns', 300)

Data Preprocessing¶

In [2]:
file_path = 'C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/SmartLocationDatabaseV3/SmartLocationDatabase.gdb'
print(gpd.list_layers(file_path))


# Select only California Data
sql = f"""
    SELECT * FROM "EPA_SLD_Database_V3"
     WHERE STATEFP = '06'
"""


california_data = gpd.read_file(file_path, engine="pyogrio", sql=sql)
# Write to CSV 
#california_data.to_csv('C:/Users/rdn91/Downloads/WalkabilityIndex/datatable.csv')

# Write to GeoJSON
#raw_df.to_file('/path/to/out_geojson.geojson', driver='GeoJSON')
                  name geometry_type
0  EPA_SLD_Database_V3  MultiPolygon
In [165]:
california_data.head()  #geoid --> census block group
Out[165]:
GEOID10 GEOID20 STATEFP COUNTYFP TRACTCE BLKGRPCE CSA CSA_Name CBSA CBSA_Name CBSA_POP CBSA_EMP CBSA_WRK Ac_Total Ac_Water Ac_Land Ac_Unpr TotPop CountHU HH P_WrkAge AutoOwn0 Pct_AO0 AutoOwn1 Pct_AO1 AutoOwn2p Pct_AO2p Workers R_LowWageWk R_MedWageWk R_HiWageWk R_PCTLOWWAGE TotEmp E5_Ret E5_Off E5_Ind E5_Svc E5_Ent E8_Ret E8_off E8_Ind E8_Svc E8_Ent E8_Ed E8_Hlth E8_Pub E_LowWageWk E_MedWageWk E_HiWageWk E_PctLowWage D1A D1B D1C D1C5_RET D1C5_OFF D1C5_IND D1C5_SVC D1C5_ENT D1C8_RET D1C8_OFF D1C8_IND D1C8_SVC D1C8_ENT D1C8_ED D1C8_HLTH D1C8_PUB D1D D1_FLAG D2A_JPHH D2B_E5MIX D2B_E5MIXA D2B_E8MIX D2B_E8MIXA D2A_EPHHM D2C_TRPMX1 D2C_TRPMX2 D2C_TRIPEQ D2R_JOBPOP D2R_WRKEMP D2A_WRKEMP D2C_WREMLX D3A D3AAO D3AMM D3APO D3B D3BAO D3BMM3 D3BMM4 D3BPO3 D3BPO4 D4A D4B025 D4B050 D4C D4D D4E D5AR D5AE D5BR D5BE D5CR D5CRI D5CE D5CEI D5DR D5DRI D5DE D5DEI D2A_Ranked D2B_Ranked D3B_Ranked D4A_Ranked NatWalkInd Region Households Workers_1 Residents Drivers Vehicles White Male Lowwage Medwage Highwage W_P_Lowwage W_P_Medwage W_P_Highwage GasPrice logd1a logd1c logd3aao logd3apo d4bo25 d5dei_1 logd4d UPTpercap B_C_constant B_C_male B_C_ld1c B_C_drvmveh B_C_ld1a B_C_ld3apo B_C_inc1 B_C_gasp B_N_constant B_N_inc2 B_N_inc3 B_N_white B_N_male B_N_drvmveh B_N_gasp B_N_ld1a B_N_ld1c B_N_ld3aao B_N_ld3apo B_N_d4bo25 B_N_d5dei B_N_UPTpc C_R_Households C_R_Pop C_R_Workers C_R_Drivers C_R_Vehicles C_R_White C_R_Male C_R_Lowwage C_R_Medwage C_R_Highwage C_R_DrmV NonCom_VMT_Per_Worker Com_VMT_Per_Worker VMT_per_worker VMT_tot_min VMT_tot_max VMT_tot_avg GHG_per_worker Annual_GHG SLC_score Shape_Length Shape_Area geometry
0 060530111013 060530111013 06 053 011101 3 None None 41500 Salinas, CA 433212.0 171278.0 168077.0 346.017589 0.112678 345.904912 339.58397 3226 743 712 0.522 80 0.112360 191 0.268258 441 0.619382 1157 247 479 431 0.213483 1894 122 21 1348 156 247 122 21 1348 42 247 0 114 0 1051 663 180 0.554910 2.187971 9.499859 5.577413 0.359263 0.061840 3.969563 0.459386 0.727361 0.359263 0.061840 3.969563 0.123681 0.727361 0.000000 0.335705 0.000000 7.765384 0.0 2.660112 0.583986 0.583986 0.551339 0.475063 0.708537 0.825457 0.800664 0.535121 0.739844 0.758440 0.610876 6.776505e-01 16.931275 2.424792 1.102102 13.404381 88.229565 3.700439 3.700439 1.850219 103.612290 14.801756 653.13 0.0 0.0 -99999.00 -99999.000000 -99999.000000 7803.0 9168.0 19967.0 12233.0 0.001746 0.194196 0.002093 0.250205 0.008100 0.559723 0.004415 0.307138 16.0 7.0 13.0 14.0 12.833333 Salinas, CA Metro Area 684 1157 3177 1942.16 1326.0 141 1710 247 479 431 0.554910 0.350053 0.095037 342 1.159385 1.883641 1.231041 2.667532 0 0 0 10 3.257603 -0.024294 0.035214 -0.328330 -0.051495 -0.375423 -0.275634 0.001028 2.336001 0.054826 0.066514 -0.187630 0.090756 -0.257335 -0.000358 -0.206061 -0.206061 0.103747 -0.177292 -0.198867 -0.115549 -0.000988 127155 433410 168080 289744.40 249520 0.299342 0.509912 0.218557 0.341141 0.440284 0.316341 3.597743 11.637535 15.235278 9.558585 44.831893 19.442039 13.574633 3529.404532 83.906547 5061.505510 1.400314e+06 MULTIPOLYGON (((-2223551.553 1784238.168, -222...
1 060530111022 060530111022 06 053 011102 2 None None 41500 Salinas, CA 433212.0 171278.0 168077.0 358.705927 0.000000 358.705927 351.93580 4144 1035 940 0.543 14 0.014894 206 0.219149 720 0.765957 2253 434 790 1029 0.192632 412 60 96 68 181 7 60 38 68 37 7 28 116 58 127 148 137 0.308252 2.940877 11.774875 1.170668 0.170486 0.272777 0.193217 0.514298 0.019890 0.170486 0.107974 0.193217 0.105133 0.019890 0.079560 0.329606 0.164803 4.111545 0.0 0.438298 0.837517 0.837517 0.913236 0.913236 0.572388 0.583169 0.626348 0.704283 0.180860 0.309193 5.468447 1.146511e-02 24.214972 1.575386 2.802820 19.836766 143.381182 0.000000 14.273531 10.705148 139.166923 30.331252 1090.33 0.0 0.0 -99999.00 -99999.000000 -99999.000000 8017.0 9431.0 21001.0 13293.0 0.001794 0.199522 0.002153 0.257382 0.008519 0.588709 0.004798 0.333752 12.0 20.0 17.0 13.0 15.333333 Salinas, CA Metro Area 919 2253 4079 2604.80 2376.0 352 2130 434 790 1029 0.308252 0.359223 0.332524 342 1.371403 0.775035 0.946000 3.036719 0 0 0 10 3.257603 -0.024294 0.035214 -0.328330 -0.051495 -0.375423 -0.275634 0.001028 2.336001 0.054826 0.066514 -0.187630 0.090756 -0.257335 -0.000358 -0.206061 -0.206061 0.103747 -0.177292 -0.198867 -0.115549 -0.000988 127155 433410 168080 289744.40 249520 0.299342 0.509912 0.218557 0.341141 0.440284 0.316341 3.987867 10.315683 14.303550 9.558585 44.831893 19.442039 12.744463 3313.560384 86.548001 7248.395767 1.451665e+06 MULTIPOLYGON (((-2224166.846 1785356.591, -222...
2 060971513084 060971513084 06 097 151308 4 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 67.135587 0.000000 67.135587 60.29173 396 169 131 0.490 0 0.000000 10 0.076336 121 0.923664 276 46 78 152 0.166667 30 4 0 16 10 0 4 0 16 9 0 0 1 0 6 9 15 0.200000 2.803038 6.568065 0.497581 0.066344 0.000000 0.265376 0.165860 0.000000 0.066344 0.000000 0.265376 0.149274 0.000000 0.000000 0.016586 0.000000 3.300619 0.0 0.229008 0.883037 0.602767 0.777957 0.518638 0.477267 0.397559 0.395591 0.095258 0.140845 0.196078 9.200000 2.746536e-04 22.454792 0.000000 0.064477 22.390315 149.409879 0.000000 0.000000 0.000000 181.125994 28.598841 615.57 0.0 0.0 0.67 6.387075 0.001692 20817.0 23238.0 3423.0 4467.0 0.002432 0.470739 0.002547 0.504417 0.001056 0.058551 0.001686 0.098464 9.0 9.0 17.0 15.0 13.666667 Santa Rosa-Petaluma, CA Metro Area 160 276 464 273.68 NaN 324 244 46 78 152 0.200000 0.300000 0.500000 342 1.335800 0.403851 0.000000 3.152322 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.012110 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.586229 36.685037 41.271266 18.791349 72.049792 39.609060 36.772698 9560.901540 57.790885 2700.963896 2.716912e+05 MULTIPOLYGON (((-2281091.02 2025181.953, -2281...
3 060510001022 060510001022 06 051 000102 2 None None None 0.0 0.0 0.0 107095.367112 49.323878 107046.043234 6959.85118 935 508 327 0.451 24 0.073394 106 0.324159 197 0.602446 316 82 113 121 0.259494 26 12 2 6 5 1 12 2 6 0 1 0 5 0 11 9 6 0.423077 0.072990 0.134342 0.003736 0.001724 0.000287 0.000862 0.000718 0.000144 0.001724 0.000287 0.000862 0.000000 0.000144 0.000000 0.000718 0.000000 0.076726 0.0 0.079511 0.829425 0.829425 0.829425 0.641955 0.201652 0.271701 0.290852 0.001513 0.000000 0.000000 12.153846 1.432010e-05 0.954596 0.109177 0.106369 0.739051 0.450593 0.089681 0.125553 0.017936 0.460363 0.041851 590.09 0.0 0.0 -99999.00 -99999.000000 -99999.000000 63.0 384.0 26.0 316.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 13.0 1.0 15.0 8.000000 Mono County 309 316 928 680.24 NaN 671 376 82 113 121 0.423077 0.346154 0.230769 342 0.070449 0.003729 0.103618 0.553339 0 0 0 0 4.962820 0.287264 0.138923 -0.468211 0.442641 -0.521250 0.068571 -0.007097 2.192546 0.054818 0.060232 -0.169290 0.076734 -0.199936 -0.000238 -0.184867 -0.184867 0.079063 -0.155833 -0.373872 -0.121008 -0.001369 4765 14310 5234 10736.00 5073 0.650454 0.533473 0.357853 0.317730 0.324417 1.188458 6.658247 7.491844 14.150092 10.231154 33.306664 17.789688 12.607732 3278.010232 83.016896 126451.935344 4.334090e+08 MULTIPOLYGON (((-2025810.921 1952822.52, -2025...
4 060971513092 060971513092 06 097 151309 2 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 570.037214 0.000000 570.037214 471.33595 2560 896 869 0.593 8 0.009206 200 0.230150 661 0.760644 1406 289 358 759 0.205548 63 1 0 24 36 2 1 0 24 13 2 5 18 0 20 19 24 0.317460 1.900980 5.431370 0.133663 0.002122 0.000000 0.050919 0.076379 0.004243 0.002122 0.000000 0.050919 0.027581 0.004243 0.010608 0.038189 0.000000 2.034642 0.0 0.072497 0.622319 0.536037 0.796767 0.686537 0.189938 0.234089 0.244020 0.001481 0.048037 0.085773 22.317460 5.520070e-10 9.245028 1.434447 1.089662 6.720919 42.302361 4.490935 1.122734 2.245467 37.050213 14.595538 742.98 0.0 0.0 0.67 0.752232 0.000262 23024.0 26141.0 265.0 2111.0 0.002689 0.520646 0.002865 0.567431 0.000082 0.004533 0.000797 0.046532 3.0 15.0 9.0 14.0 10.666667 Santa Rosa-Petaluma, CA Metro Area 966 1406 2882 2041.60 2169.0 1853 1286 289 358 759 0.317460 0.301587 0.380952 342 1.065048 0.125454 0.889720 2.043933 0 0 1 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.012110 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 6.306880 45.213616 51.520496 18.791349 72.049792 39.609060 45.904762 11935.238176 38.546556 7055.254272 2.306910e+06 MULTIPOLYGON (((-2279605.043 2026679.113, -227...
In [166]:
california_data.shape
Out[166]:
(23212, 182)

Assign counties to outcome variable¶

In [3]:
# Get county names given coutny_fips codes
county_info = pd.read_csv('C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/California_Counties.csv')
county_info['CNTY_FIPS'] = county_info['CNTY_FIPS'].astype("str")
county_info['CNTY_FIPS'] = '0' + county_info['CNTY_FIPS']
county_info = county_info.rename(columns = {'CNTY_FIPS' : 'COUNTYFP', 'NAME' : 'COUNTY_NAME'})
county_info.drop(['OBJECTID', "STATE_NAME", "STATE_FIPS", "FIPS", "Shape__Area", "Shape__Length"], axis=1, inplace=True)
county_info.head()
Out[3]:
COUNTY_NAME COUNTYFP
0 Alameda County 01
1 Alpine County 03
2 Amador County 05
3 Butte County 07
4 Calaveras County 09
In [4]:
# Join California data and county_info
california_data = pd.merge(california_data, county_info, how='inner', on = "COUNTYFP")
california_data.head()
Out[4]:
GEOID10 GEOID20 STATEFP COUNTYFP TRACTCE BLKGRPCE CSA CSA_Name CBSA CBSA_Name CBSA_POP CBSA_EMP CBSA_WRK Ac_Total Ac_Water Ac_Land Ac_Unpr TotPop CountHU HH P_WrkAge AutoOwn0 Pct_AO0 AutoOwn1 Pct_AO1 AutoOwn2p Pct_AO2p Workers R_LowWageWk R_MedWageWk R_HiWageWk R_PCTLOWWAGE TotEmp E5_Ret E5_Off E5_Ind E5_Svc E5_Ent E8_Ret E8_off E8_Ind E8_Svc E8_Ent E8_Ed E8_Hlth E8_Pub E_LowWageWk E_MedWageWk E_HiWageWk E_PctLowWage D1A D1B D1C D1C5_RET D1C5_OFF D1C5_IND D1C5_SVC D1C5_ENT D1C8_RET D1C8_OFF D1C8_IND D1C8_SVC D1C8_ENT D1C8_ED D1C8_HLTH D1C8_PUB D1D D1_FLAG D2A_JPHH D2B_E5MIX D2B_E5MIXA D2B_E8MIX D2B_E8MIXA D2A_EPHHM D2C_TRPMX1 D2C_TRPMX2 D2C_TRIPEQ D2R_JOBPOP D2R_WRKEMP D2A_WRKEMP D2C_WREMLX D3A D3AAO D3AMM D3APO D3B D3BAO D3BMM3 D3BMM4 D3BPO3 D3BPO4 D4A D4B025 D4B050 D4C D4D D4E D5AR D5AE D5BR D5BE D5CR D5CRI D5CE D5CEI D5DR D5DRI D5DE D5DEI D2A_Ranked D2B_Ranked D3B_Ranked D4A_Ranked NatWalkInd Region Households Workers_1 Residents Drivers Vehicles White Male Lowwage Medwage Highwage W_P_Lowwage W_P_Medwage W_P_Highwage GasPrice logd1a logd1c logd3aao logd3apo d4bo25 d5dei_1 logd4d UPTpercap B_C_constant B_C_male B_C_ld1c B_C_drvmveh B_C_ld1a B_C_ld3apo B_C_inc1 B_C_gasp B_N_constant B_N_inc2 B_N_inc3 B_N_white B_N_male B_N_drvmveh B_N_gasp B_N_ld1a B_N_ld1c B_N_ld3aao B_N_ld3apo B_N_d4bo25 B_N_d5dei B_N_UPTpc C_R_Households C_R_Pop C_R_Workers C_R_Drivers C_R_Vehicles C_R_White C_R_Male C_R_Lowwage C_R_Medwage C_R_Highwage C_R_DrmV NonCom_VMT_Per_Worker Com_VMT_Per_Worker VMT_per_worker VMT_tot_min VMT_tot_max VMT_tot_avg GHG_per_worker Annual_GHG SLC_score Shape_Length Shape_Area geometry COUNTY_NAME
0 060530111013 060530111013 06 053 011101 3 None None 41500 Salinas, CA 433212.0 171278.0 168077.0 346.017589 0.112678 345.904912 339.58397 3226 743 712 0.522 80 0.112360 191 0.268258 441 0.619382 1157 247 479 431 0.213483 1894 122 21 1348 156 247 122 21 1348 42 247 0 114 0 1051 663 180 0.554910 2.187971 9.499859 5.577413 0.359263 0.061840 3.969563 0.459386 0.727361 0.359263 0.061840 3.969563 0.123681 0.727361 0.000000 0.335705 0.000000 7.765384 0.0 2.660112 0.583986 0.583986 0.551339 0.475063 0.708537 0.825457 0.800664 0.535121 0.739844 0.758440 0.610876 6.776505e-01 16.931275 2.424792 1.102102 13.404381 88.229565 3.700439 3.700439 1.850219 103.612290 14.801756 653.13 0.0 0.0 -99999.00 -99999.000000 -99999.000000 7803.0 9168.0 19967.0 12233.0 0.001746 0.194196 0.002093 0.250205 0.008100 0.559723 0.004415 0.307138 16.0 7.0 13.0 14.0 12.833333 Salinas, CA Metro Area 684 1157 3177 1942.16 1326.0 141 1710 247 479 431 0.554910 0.350053 0.095037 342 1.159385 1.883641 1.231041 2.667532 0 0 0 10 3.257603 -0.024294 0.035214 -0.328330 -0.051495 -0.375423 -0.275634 0.001028 2.336001 0.054826 0.066514 -0.187630 0.090756 -0.257335 -0.000358 -0.206061 -0.206061 0.103747 -0.177292 -0.198867 -0.115549 -0.000988 127155 433410 168080 289744.40 249520 0.299342 0.509912 0.218557 0.341141 0.440284 0.316341 3.597743 11.637535 15.235278 9.558585 44.831893 19.442039 13.574633 3529.404532 83.906547 5061.505510 1.400314e+06 MULTIPOLYGON (((-2223551.553 1784238.168, -222... Monterey County
1 060530111022 060530111022 06 053 011102 2 None None 41500 Salinas, CA 433212.0 171278.0 168077.0 358.705927 0.000000 358.705927 351.93580 4144 1035 940 0.543 14 0.014894 206 0.219149 720 0.765957 2253 434 790 1029 0.192632 412 60 96 68 181 7 60 38 68 37 7 28 116 58 127 148 137 0.308252 2.940877 11.774875 1.170668 0.170486 0.272777 0.193217 0.514298 0.019890 0.170486 0.107974 0.193217 0.105133 0.019890 0.079560 0.329606 0.164803 4.111545 0.0 0.438298 0.837517 0.837517 0.913236 0.913236 0.572388 0.583169 0.626348 0.704283 0.180860 0.309193 5.468447 1.146511e-02 24.214972 1.575386 2.802820 19.836766 143.381182 0.000000 14.273531 10.705148 139.166923 30.331252 1090.33 0.0 0.0 -99999.00 -99999.000000 -99999.000000 8017.0 9431.0 21001.0 13293.0 0.001794 0.199522 0.002153 0.257382 0.008519 0.588709 0.004798 0.333752 12.0 20.0 17.0 13.0 15.333333 Salinas, CA Metro Area 919 2253 4079 2604.80 2376.0 352 2130 434 790 1029 0.308252 0.359223 0.332524 342 1.371403 0.775035 0.946000 3.036719 0 0 0 10 3.257603 -0.024294 0.035214 -0.328330 -0.051495 -0.375423 -0.275634 0.001028 2.336001 0.054826 0.066514 -0.187630 0.090756 -0.257335 -0.000358 -0.206061 -0.206061 0.103747 -0.177292 -0.198867 -0.115549 -0.000988 127155 433410 168080 289744.40 249520 0.299342 0.509912 0.218557 0.341141 0.440284 0.316341 3.987867 10.315683 14.303550 9.558585 44.831893 19.442039 12.744463 3313.560384 86.548001 7248.395767 1.451665e+06 MULTIPOLYGON (((-2224166.846 1785356.591, -222... Monterey County
2 060971513084 060971513084 06 097 151308 4 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 67.135587 0.000000 67.135587 60.29173 396 169 131 0.490 0 0.000000 10 0.076336 121 0.923664 276 46 78 152 0.166667 30 4 0 16 10 0 4 0 16 9 0 0 1 0 6 9 15 0.200000 2.803038 6.568065 0.497581 0.066344 0.000000 0.265376 0.165860 0.000000 0.066344 0.000000 0.265376 0.149274 0.000000 0.000000 0.016586 0.000000 3.300619 0.0 0.229008 0.883037 0.602767 0.777957 0.518638 0.477267 0.397559 0.395591 0.095258 0.140845 0.196078 9.200000 2.746536e-04 22.454792 0.000000 0.064477 22.390315 149.409879 0.000000 0.000000 0.000000 181.125994 28.598841 615.57 0.0 0.0 0.67 6.387075 0.001692 20817.0 23238.0 3423.0 4467.0 0.002432 0.470739 0.002547 0.504417 0.001056 0.058551 0.001686 0.098464 9.0 9.0 17.0 15.0 13.666667 Santa Rosa-Petaluma, CA Metro Area 160 276 464 273.68 NaN 324 244 46 78 152 0.200000 0.300000 0.500000 342 1.335800 0.403851 0.000000 3.152322 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.012110 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.586229 36.685037 41.271266 18.791349 72.049792 39.609060 36.772698 9560.901540 57.790885 2700.963896 2.716912e+05 MULTIPOLYGON (((-2281091.02 2025181.953, -2281... Sonoma County
3 060510001022 060510001022 06 051 000102 2 None None None 0.0 0.0 0.0 107095.367112 49.323878 107046.043234 6959.85118 935 508 327 0.451 24 0.073394 106 0.324159 197 0.602446 316 82 113 121 0.259494 26 12 2 6 5 1 12 2 6 0 1 0 5 0 11 9 6 0.423077 0.072990 0.134342 0.003736 0.001724 0.000287 0.000862 0.000718 0.000144 0.001724 0.000287 0.000862 0.000000 0.000144 0.000000 0.000718 0.000000 0.076726 0.0 0.079511 0.829425 0.829425 0.829425 0.641955 0.201652 0.271701 0.290852 0.001513 0.000000 0.000000 12.153846 1.432010e-05 0.954596 0.109177 0.106369 0.739051 0.450593 0.089681 0.125553 0.017936 0.460363 0.041851 590.09 0.0 0.0 -99999.00 -99999.000000 -99999.000000 63.0 384.0 26.0 316.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 13.0 1.0 15.0 8.000000 Mono County 309 316 928 680.24 NaN 671 376 82 113 121 0.423077 0.346154 0.230769 342 0.070449 0.003729 0.103618 0.553339 0 0 0 0 4.962820 0.287264 0.138923 -0.468211 0.442641 -0.521250 0.068571 -0.007097 2.192546 0.054818 0.060232 -0.169290 0.076734 -0.199936 -0.000238 -0.184867 -0.184867 0.079063 -0.155833 -0.373872 -0.121008 -0.001369 4765 14310 5234 10736.00 5073 0.650454 0.533473 0.357853 0.317730 0.324417 1.188458 6.658247 7.491844 14.150092 10.231154 33.306664 17.789688 12.607732 3278.010232 83.016896 126451.935344 4.334090e+08 MULTIPOLYGON (((-2025810.921 1952822.52, -2025... Mono County
4 060971513092 060971513092 06 097 151309 2 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 570.037214 0.000000 570.037214 471.33595 2560 896 869 0.593 8 0.009206 200 0.230150 661 0.760644 1406 289 358 759 0.205548 63 1 0 24 36 2 1 0 24 13 2 5 18 0 20 19 24 0.317460 1.900980 5.431370 0.133663 0.002122 0.000000 0.050919 0.076379 0.004243 0.002122 0.000000 0.050919 0.027581 0.004243 0.010608 0.038189 0.000000 2.034642 0.0 0.072497 0.622319 0.536037 0.796767 0.686537 0.189938 0.234089 0.244020 0.001481 0.048037 0.085773 22.317460 5.520070e-10 9.245028 1.434447 1.089662 6.720919 42.302361 4.490935 1.122734 2.245467 37.050213 14.595538 742.98 0.0 0.0 0.67 0.752232 0.000262 23024.0 26141.0 265.0 2111.0 0.002689 0.520646 0.002865 0.567431 0.000082 0.004533 0.000797 0.046532 3.0 15.0 9.0 14.0 10.666667 Santa Rosa-Petaluma, CA Metro Area 966 1406 2882 2041.60 2169.0 1853 1286 289 358 759 0.317460 0.301587 0.380952 342 1.065048 0.125454 0.889720 2.043933 0 0 1 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.012110 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 6.306880 45.213616 51.520496 18.791349 72.049792 39.609060 45.904762 11935.238176 38.546556 7055.254272 2.306910e+06 MULTIPOLYGON (((-2279605.043 2026679.113, -227... Sonoma County
In [5]:
# Number of census blocks for each county
print(len(california_data['COUNTY_NAME'].value_counts()))
california_data['COUNTY_NAME'].value_counts()
45
Out[5]:
COUNTY_NAME
Los Angeles County        6425
Orange County             1823
San Diego County          1795
San Bernardino County     1092
Santa Clara County        1075
Riverside County          1030
Sacramento County          912
Contra Costa County        637
Fresno County              589
San Francisco County       581
San Mateo County           463
Kern County                456
San Joaquin County         395
Sonoma County              387
Santa Barbara County       313
Stanislaus County          308
Solano County              285
Monterey County            233
Placer County              213
Santa Cruz County          196
Marin County               175
San Luis Obispo County     163
Merced County              144
Shasta County              131
El Dorado County           125
Humboldt County            108
Napa County                106
Imperial County             96
Kings County                81
Madera County               80
Mendocino County            79
Nevada County               75
Lake County                 48
San Benito County           40
Siskiyou County             37
Lassen County               25
Glenn County                23
Plumas County               22
Colusa County               20
Del Norte County            20
Mono County                 17
Mariposa County             17
Inyo County                 16
Modoc County                12
Sierra County                5
Name: count, dtype: int64
In [6]:
# Create target labels
california_data['target'] = 'Unknown County'
california_data.loc[california_data['COUNTY_NAME'].isin(['Butte County', 'Colusa County', 'Del Norte County', 
                                                       'Glenn County', 'Humboldt County', 'Lake County',
                                                       'Lassen County', 'Mendocino County', 'Modoc County',
                                                       'Nevada County', 'Plumas County', 'Shasta County',
                                                       'Sierra County', 'Siskiyou County', 'Tehama County',
                                                       'Trinity County']), 'target'] = 'Northern California'
california_data.loc[california_data['COUNTY_NAME'].isin(['Alameda County', 'Contra Costa County', 'Marin County',
                                                       'Monterey County', 'Napa County', 'San Benito County', 
                                                       'San Francisco County', 'San Mateo County', 'Santa Clara County',
                                                       'Santa Cruz County', 'Solano County', 'Sonoma County'
                                                       ]), 'target'] = 'Bay Area'
california_data.loc[california_data['COUNTY_NAME'].isin(['Imperial County', 'Kern County', 'Los Angeles County', 
                                                       'Orange County', 'Riverside County', 'San Bernardino County',
                                                       'San Diego County', 'San Luis Obispo County', 'Santa Barbara County',
                                                       'Ventura County', 'Tri-City County']), 'target'] = 'Southern California'
california_data.loc[california_data['COUNTY_NAME'].isin(['Alpine County', 'Amador County', 'Calaveras County',
                                                       'El Dorado County', 'Fresno County', 'Inyo County',
                                                       'Kings County', 'Madera County', 'Mariposa County',
                                                       'Merced County', 'Mono County', 'Placer County',
                                                       'Sacramento County', 'San Joaquin County', 'Stanislaus County',
                                                       'Sutter County', 'Yuba County', 'Tulare County',
                                                       'Tuolumne County', 'Yolo County']), 'target'] = 'Central California'
print(len(california_data['target'].value_counts()))
california_data['target'].value_counts()
4
Out[6]:
target
Southern California    13193
Bay Area                4178
Central California      2897
Northern California      605
Name: count, dtype: int64
In [7]:
california_data.shape
Out[7]:
(20873, 184)

Missing Values/Outliers/Errors¶

In [27]:
# See which columns have nulls and how many
null_cols = california_data.columns[california_data.isna().any()]
null_counts = california_data[null_cols].isna().sum()
print(null_counts)
CSA          3505
CSA_Name     3505
CBSA_Name     169
D1C8_OFF       20
Vehicles     2266
dtype: int64
In [39]:
sns.histplot(california_data, x= 'Ac_Land', log_scale=True)
Out[39]:
<AxesSubplot: xlabel='Ac_Land', ylabel='Count'>
No description has been provided for this image
In [ ]:
sns.histplot(california_data, )

Investigate Vehicles column¶

In [173]:
california_data['Vehicles'].value_counts()
Out[173]:
Vehicles
762.0     30
815.0     28
898.0     27
747.0     26
951.0     26
          ..
3638.0     1
2735.0     1
4953.0     1
3593.0     1
235.0      1
Name: count, Length: 2778, dtype: int64
In [174]:
null_vehicle_rows = california_data[california_data['Vehicles'].isna()]
(null_vehicle_rows['COUNTY_NAME'].value_counts() / california_data['COUNTY_NAME'].value_counts()).sort_values(ascending=False)
Out[174]:
COUNTY_NAME
Mono County               0.470588
Sierra County             0.400000
El Dorado County          0.184000
Imperial County           0.156250
Madera County             0.150000
Del Norte County          0.150000
Contra Costa County       0.147567
Plumas County             0.136364
Placer County             0.131455
Glenn County              0.130435
Kern County               0.127193
Marin County              0.125714
Orange County             0.125617
San Bernardino County     0.122711
Los Angeles County        0.120623
Nevada County             0.120000
Santa Barbara County      0.118211
Mariposa County           0.117647
San Mateo County          0.107991
Sonoma County             0.105943
Lake County               0.104167
San Diego County          0.103621
Santa Clara County        0.103256
Humboldt County           0.101852
Santa Cruz County         0.096939
Fresno County             0.095076
Monterey County           0.094421
San Francisco County      0.087780
Riverside County          0.084466
Modoc County              0.083333
Lassen County             0.080000
Napa County               0.075472
Kings County              0.074074
Sacramento County         0.070175
Solano County             0.070175
Inyo County               0.062500
Stanislaus County         0.061688
San Luis Obispo County    0.061350
Merced County             0.055556
San Joaquin County        0.050633
Colusa County             0.050000
Shasta County             0.045802
Mendocino County          0.037975
San Benito County         0.025000
Siskiyou County                NaN
Name: count, dtype: float64
In [175]:
null_amount = null_vehicle_rows[california_data['COUNTY_NAME'].isin(['Mono County', 'Sierra County'])]
null_amount['COUNTY_NAME'].value_counts()
c:\Users\rdn91\AppData\Local\Programs\Python\Python311\Lib\site-packages\geopandas\geodataframe.py:1750: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  result = super().__getitem__(key)
Out[175]:
COUNTY_NAME
Mono County      8
Sierra County    2
Name: count, dtype: int64
In [176]:
original_amount = california_data[california_data['COUNTY_NAME'].isin(['Mono County', 'Sierra County'])]
original_amount['COUNTY_NAME'].value_counts()
Out[176]:
COUNTY_NAME
Mono County      17
Sierra County     5
Name: count, dtype: int64
In [177]:
mono_sierra = california_data[california_data['COUNTY_NAME'].isin(['Mono County', 'Sierra County'])]
other_counties =  california_data[california_data['COUNTY_NAME'].isin(['Orange County'])]
x_var = "Vehicles"
fig, ax = plt.subplots(2,2, figsize=(20,10))
sns.histplot(data=mono_sierra, x=x_var, hue="COUNTY_NAME", ax =ax[0, 0]).set_title("Histogram of Vehicles")
sns.kdeplot(data=mono_sierra, x=x_var, hue="COUNTY_NAME", ax=ax[0,1]).set_title("KDE plot of Vehicles")
sns.histplot(data=other_counties, x=x_var, ax =ax[1, 0]).set_title("Histogram of Vehicles for all other counties")
sns.kdeplot(data=other_counties, x=x_var, ax=ax[1,1]).set_title("KDE plot of Vehicles for all other counties")
plt.subplots_adjust(hspace=0.3)
No description has been provided for this image

After reading the supporting data dictionaries, there is not enough information to understand what the "Vehicles" column means. Also, since the counties don't have that much data, the "discreteness" can be quite an issue later on. Therefore, I will drop it.

Investigate D1C8_OFF column¶

In [178]:
california_data['D1C8_OFF'].value_counts().sort_values()
Out[178]:
D1C8_OFF
0.064529       1
1.710945       1
0.108574       1
0.013567       1
0.667934       1
            ... 
0.170265       2
0.172473       2
0.625374       2
0.061783       2
0.000000    5946
Name: count, Length: 14902, dtype: int64
In [179]:
sns.kdeplot(data=california_data, x="D1C8_OFF")
Out[179]:
<AxesSubplot: xlabel='D1C8_OFF', ylabel='Density'>
No description has been provided for this image
In [180]:
null_dlc_rows = california_data[california_data['D1C8_OFF'].isna()]
(null_dlc_rows['COUNTY_NAME'].value_counts() / california_data['COUNTY_NAME'].value_counts()).sort_values(ascending=False)
Out[180]:
COUNTY_NAME
Del Norte County          0.050000
Mendocino County          0.012658
Humboldt County           0.009259
El Dorado County          0.008000
San Luis Obispo County    0.006135
Marin County              0.005714
Santa Cruz County         0.005102
Placer County             0.004695
Monterey County           0.004292
Santa Barbara County      0.003195
Sonoma County             0.002584
San Mateo County          0.002160
San Francisco County      0.001721
Contra Costa County       0.001570
Los Angeles County        0.000623
San Diego County          0.000557
Orange County             0.000549
Colusa County                  NaN
Fresno County                  NaN
Glenn County                   NaN
Imperial County                NaN
Inyo County                    NaN
Kern County                    NaN
Kings County                   NaN
Lake County                    NaN
Lassen County                  NaN
Madera County                  NaN
Mariposa County                NaN
Merced County                  NaN
Modoc County                   NaN
Mono County                    NaN
Napa County                    NaN
Nevada County                  NaN
Plumas County                  NaN
Riverside County               NaN
Sacramento County              NaN
San Benito County              NaN
San Bernardino County          NaN
San Joaquin County             NaN
Santa Clara County             NaN
Shasta County                  NaN
Sierra County                  NaN
Siskiyou County                NaN
Solano County                  NaN
Stanislaus County              NaN
Name: count, dtype: float64
In [181]:
# Let's look at the distribution for each county that has a missing value for dlc8_off
null_dlc_counties = null_dlc_rows['COUNTY_NAME'].values.tolist()
dlc_cols = null_dlc_rows.columns.tolist()
dlc_cols = [element for element in dlc_cols if element.startswith("D1C")]
dlc_cols
Out[181]:
['D1C',
 'D1C5_RET',
 'D1C5_OFF',
 'D1C5_IND',
 'D1C5_SVC',
 'D1C5_ENT',
 'D1C8_RET',
 'D1C8_OFF',
 'D1C8_IND',
 'D1C8_SVC',
 'D1C8_ENT',
 'D1C8_ED',
 'D1C8_HLTH',
 'D1C8_PUB']
In [76]:
# Find all columns that have weird values (-9999 or 0)
negative_cols = california_data.columns[california_data.isin([-99999.0]).any()].tolist()
zero_val_cols = california_data.columns[california_data.isin([0]).any()].tolist()
print(negative_cols)
print(zero_val_cols)
['D4A', 'D4C', 'D4D', 'D4E', 'D5BR', 'D5BE', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI']
['CBSA_POP', 'CBSA_EMP', 'CBSA_WRK', 'Ac_Water', 'Ac_Land', 'Ac_Unpr', 'TotPop', 'CountHU', 'HH', 'P_WrkAge', 'AutoOwn0', 'Pct_AO0', 'AutoOwn1', 'Pct_AO1', 'AutoOwn2p', 'Pct_AO2p', 'Workers', 'R_LowWageWk', 'R_MedWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp', 'E5_Ret', 'E5_Off', 'E5_Ind', 'E5_Svc', 'E5_Ent', 'E8_Ret', 'E8_off', 'E8_Ind', 'E8_Svc', 'E8_Ent', 'E8_Ed', 'E8_Hlth', 'E8_Pub', 'E_LowWageWk', 'E_MedWageWk', 'E_HiWageWk', 'E_PctLowWage', 'D1A', 'D1B', 'D1C', 'D1C5_RET', 'D1C5_OFF', 'D1C5_IND', 'D1C5_SVC', 'D1C5_ENT', 'D1C8_RET', 'D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_ENT', 'D1C8_ED', 'D1C8_HLTH', 'D1C8_PUB', 'D1D', 'D1_FLAG', 'D2A_JPHH', 'D2B_E5MIX', 'D2B_E5MIXA', 'D2B_E8MIX', 'D2B_E8MIXA', 'D2A_EPHHM', 'D2C_TRPMX1', 'D2C_TRPMX2', 'D2C_TRIPEQ', 'D2R_JOBPOP', 'D2R_WRKEMP', 'D2A_WRKEMP', 'D2C_WREMLX', 'D3A', 'D3AAO', 'D3AMM', 'D3APO', 'D3B', 'D3BAO', 'D3BMM3', 'D3BMM4', 'D3BPO3', 'D3BPO4', 'D4A', 'D4B025', 'D4B050', 'D4E', 'D5AR', 'D5AE', 'D5BR', 'D5CR', 'D5CRI', 'D5CE', 'D5CEI', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI', 'Households', 'Residents', 'Drivers', 'White', 'Male', 'Lowwage', 'Medwage', 'Highwage', 'W_P_Lowwage', 'W_P_Medwage', 'W_P_Highwage', 'logd1a', 'logd1c', 'logd3aao', 'logd3apo', 'd4bo25', 'd5dei_1', 'logd4d', 'UPTpercap', 'B_N_UPTpc', 'SLC_score']
In [ ]:
plot_cols(california_data, dlc_cols, 4, "histplot")

All the D1C columns have very similar distributions and are the only columns that seem to have "-99999" value.

In [184]:
row_length = []
for column in negative_cols:
    temp = null_dlc_rows[null_dlc_rows[column] == -99999.0]
    rows, columns = temp.shape
    row_length.append(rows)

if len(set(row_length)) == 1:
    print("All of the D1C8_OFF columns that have a -99999 values also has the same value for other D1C columns.")
else:
    print("Not all D1C columns match one to one in terms of rows with -99999 values.")

print(row_length)

california_data[california_data['D1C8_OFF'].isna()].shape
All of the D1C8_OFF columns that have a -99999 values also has the same value for other D1C columns.
[20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
Out[184]:
(20, 184)
In [185]:
# Is it the same for the original data?
row_length = []
for column in negative_cols:
    temp = california_data[california_data[column] == -99999.0]
    rows, columns = temp.shape
    row_length.append(rows)

if len(set(row_length)) == 1:
    print("All of the D1C8_OFF columns that have a -99999 values also has the same value for other D1C columns.")
else:
    print("Not all D1C columns match one to one in terms of rows with -99999 values.")

print(row_length)
Not all D1C columns match one to one in terms of rows with -99999 values.
[5289, 5931, 5931, 5931, 5289, 5289, 5289, 5289, 5289, 5289]
In [77]:
delete_negative_val = california_data[(california_data != -99999.0).all(axis=1)]
delete_negative_val.shape
Out[77]:
(13471, 184)
In [78]:
delete_negative_val.columns[delete_negative_val.isin([-99999.0]).any()].tolist()
Out[78]:
[]
In [79]:
delete_negative_val.columns[delete_negative_val.isin([-99999.0]).any()].tolist()
Out[79]:
[]
In [80]:
california_data = delete_negative_val.dropna(subset=negative_cols, how='any')
california_data.shape
Out[80]:
(13471, 184)

Check nulls and weird values again¶

In [81]:
california_data[(california_data == -99999).any(axis=1)]
Out[81]:
GEOID10 GEOID20 STATEFP COUNTYFP TRACTCE BLKGRPCE CSA CSA_Name CBSA CBSA_Name CBSA_POP CBSA_EMP CBSA_WRK Ac_Total Ac_Water Ac_Land Ac_Unpr TotPop CountHU HH P_WrkAge AutoOwn0 Pct_AO0 AutoOwn1 Pct_AO1 AutoOwn2p Pct_AO2p Workers R_LowWageWk R_MedWageWk R_HiWageWk R_PCTLOWWAGE TotEmp E5_Ret E5_Off E5_Ind E5_Svc E5_Ent E8_Ret E8_off E8_Ind E8_Svc E8_Ent E8_Ed E8_Hlth E8_Pub E_LowWageWk E_MedWageWk E_HiWageWk E_PctLowWage D1A D1B D1C D1C5_RET D1C5_OFF D1C5_IND D1C5_SVC D1C5_ENT D1C8_RET D1C8_OFF D1C8_IND D1C8_SVC D1C8_ENT D1C8_ED D1C8_HLTH D1C8_PUB D1D D1_FLAG D2A_JPHH D2B_E5MIX D2B_E5MIXA D2B_E8MIX D2B_E8MIXA D2A_EPHHM D2C_TRPMX1 D2C_TRPMX2 D2C_TRIPEQ D2R_JOBPOP D2R_WRKEMP D2A_WRKEMP D2C_WREMLX D3A D3AAO D3AMM D3APO D3B D3BAO D3BMM3 D3BMM4 D3BPO3 D3BPO4 D4A D4B025 D4B050 D4C D4D D4E D5AR D5AE D5BR D5BE D5CR D5CRI D5CE D5CEI D5DR D5DRI D5DE D5DEI D2A_Ranked D2B_Ranked D3B_Ranked D4A_Ranked NatWalkInd Region Households Workers_1 Residents Drivers Vehicles White Male Lowwage Medwage Highwage W_P_Lowwage W_P_Medwage W_P_Highwage GasPrice logd1a logd1c logd3aao logd3apo d4bo25 d5dei_1 logd4d UPTpercap B_C_constant B_C_male B_C_ld1c B_C_drvmveh B_C_ld1a B_C_ld3apo B_C_inc1 B_C_gasp B_N_constant B_N_inc2 B_N_inc3 B_N_white B_N_male B_N_drvmveh B_N_gasp B_N_ld1a B_N_ld1c B_N_ld3aao B_N_ld3apo B_N_d4bo25 B_N_d5dei B_N_UPTpc C_R_Households C_R_Pop C_R_Workers C_R_Drivers C_R_Vehicles C_R_White C_R_Male C_R_Lowwage C_R_Medwage C_R_Highwage C_R_DrmV NonCom_VMT_Per_Worker Com_VMT_Per_Worker VMT_per_worker VMT_tot_min VMT_tot_max VMT_tot_avg GHG_per_worker Annual_GHG SLC_score Shape_Length Shape_Area geometry COUNTY_NAME target
In [82]:
numeric_cols = california_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
california_data[numeric_cols].columns[(california_data[numeric_cols] < 0 ).all()]   # all the columns that have only negative values (it's not an error)
Out[82]:
Index(['B_N_white', 'B_N_drvmveh', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo',
       'B_N_d4bo25'],
      dtype='object')
In [83]:
california_data[numeric_cols].columns[(california_data[numeric_cols] < 0 ).any()]  # all the columns that contain at least one negative value
Out[83]:
Index(['B_C_male', 'B_C_ld1c', 'B_C_drvmveh', 'B_C_ld1a', 'B_C_ld3apo',
       'B_C_inc1', 'B_C_gasp', 'B_N_inc2', 'B_N_inc3', 'B_N_white',
       'B_N_drvmveh', 'B_N_gasp', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo',
       'B_N_d4bo25', 'B_N_d5dei', 'B_N_UPTpc', 'C_R_DrmV'],
      dtype='object')
In [84]:
california_data['B_C_male'].value_counts() # check some of those columns to confirm
Out[84]:
B_C_male
 0.011455    87
 0.113615    81
-0.030399    59
 0.092642    57
 0.075932    55
             ..
-0.120785     1
 0.210880     1
 0.196435     1
 0.146326     1
-0.029296     1
Name: count, Length: 2252, dtype: int64
In [ ]:
california_data.columns[california_data.isna().any()]
Out[ ]:
Index(['CSA', 'CSA_Name', 'CBSA_Name', 'Vehicles'], dtype='object')
In [86]:
# Drop the two null numerical columns
#california_data.drop(['Vehicles'], axis=1, inplace=True)
california_data.head()
Out[86]:
GEOID10 GEOID20 STATEFP COUNTYFP TRACTCE BLKGRPCE CSA CSA_Name CBSA CBSA_Name CBSA_POP CBSA_EMP CBSA_WRK Ac_Total Ac_Water Ac_Land Ac_Unpr TotPop CountHU HH P_WrkAge AutoOwn0 Pct_AO0 AutoOwn1 Pct_AO1 AutoOwn2p Pct_AO2p Workers R_LowWageWk R_MedWageWk R_HiWageWk R_PCTLOWWAGE TotEmp E5_Ret E5_Off E5_Ind E5_Svc E5_Ent E8_Ret E8_off E8_Ind E8_Svc E8_Ent E8_Ed E8_Hlth E8_Pub E_LowWageWk E_MedWageWk E_HiWageWk E_PctLowWage D1A D1B D1C D1C5_RET D1C5_OFF D1C5_IND D1C5_SVC D1C5_ENT D1C8_RET D1C8_OFF D1C8_IND D1C8_SVC D1C8_ENT D1C8_ED D1C8_HLTH D1C8_PUB D1D D1_FLAG D2A_JPHH D2B_E5MIX D2B_E5MIXA D2B_E8MIX D2B_E8MIXA D2A_EPHHM D2C_TRPMX1 D2C_TRPMX2 D2C_TRIPEQ D2R_JOBPOP D2R_WRKEMP D2A_WRKEMP D2C_WREMLX D3A D3AAO D3AMM D3APO D3B D3BAO D3BMM3 D3BMM4 D3BPO3 D3BPO4 D4A D4B025 D4B050 D4C D4D D4E D5AR D5AE D5BR D5BE D5CR D5CRI D5CE D5CEI D5DR D5DRI D5DE D5DEI D2A_Ranked D2B_Ranked D3B_Ranked D4A_Ranked NatWalkInd Region Households Workers_1 Residents Drivers Vehicles White Male Lowwage Medwage Highwage W_P_Lowwage W_P_Medwage W_P_Highwage GasPrice logd1a logd1c logd3aao logd3apo d4bo25 d5dei_1 logd4d UPTpercap B_C_constant B_C_male B_C_ld1c B_C_drvmveh B_C_ld1a B_C_ld3apo B_C_inc1 B_C_gasp B_N_constant B_N_inc2 B_N_inc3 B_N_white B_N_male B_N_drvmveh B_N_gasp B_N_ld1a B_N_ld1c B_N_ld3aao B_N_ld3apo B_N_d4bo25 B_N_d5dei B_N_UPTpc C_R_Households C_R_Pop C_R_Workers C_R_Drivers C_R_Vehicles C_R_White C_R_Male C_R_Lowwage C_R_Medwage C_R_Highwage C_R_DrmV NonCom_VMT_Per_Worker Com_VMT_Per_Worker VMT_per_worker VMT_tot_min VMT_tot_max VMT_tot_avg GHG_per_worker Annual_GHG SLC_score Shape_Length Shape_Area geometry COUNTY_NAME target
2 060971513084 060971513084 06 097 151308 4 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 67.135587 0.0 67.135587 60.291730 396 169 131 0.490 0 0.000000 10 0.076336 121 0.923664 276 46 78 152 0.166667 30 4 0 16 10 0 4 0 16 9 0 0 1 0 6 9 15 0.200000 2.803038 6.568065 0.497581 0.066344 0.000000 0.265376 0.165860 0.000000 0.066344 0.000000 0.265376 0.149274 0.000000 0.000000 0.016586 0.0 3.300619 0.0 0.229008 0.883037 0.602767 0.777957 0.518638 0.477267 0.397559 0.395591 9.525833e-02 0.140845 0.196078 9.200000 2.746536e-04 22.454792 0.000000 0.064477 22.390315 149.409879 0.000000 0.000000 0.000000 181.125994 28.598841 615.57 0.00000 0.000000 0.67 6.387075 0.001692 20817.0 23238.0 3423.0 4467.0 0.002432 0.470739 0.002547 0.504417 0.001056 0.058551 0.001686 0.098464 9.0 9.0 17.0 15.0 13.666667 Santa Rosa-Petaluma, CA Metro Area 160 276 464 273.68 NaN 324 244 46 78 152 0.200000 0.300000 0.500000 342 1.335800 0.403851 0.00000 3.152322 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.586229 36.685037 41.271266 18.791349 72.049792 39.60906 36.772698 9560.901540 57.790885 2700.963896 2.716912e+05 MULTIPOLYGON (((-2281091.02 2025181.953, -2281... Sonoma County Bay Area
4 060971513092 060971513092 06 097 151309 2 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 570.037214 0.0 570.037214 471.335950 2560 896 869 0.593 8 0.009206 200 0.230150 661 0.760644 1406 289 358 759 0.205548 63 1 0 24 36 2 1 0 24 13 2 5 18 0 20 19 24 0.317460 1.900980 5.431370 0.133663 0.002122 0.000000 0.050919 0.076379 0.004243 0.002122 0.000000 0.050919 0.027581 0.004243 0.010608 0.038189 0.0 2.034642 0.0 0.072497 0.622319 0.536037 0.796767 0.686537 0.189938 0.234089 0.244020 1.481149e-03 0.048037 0.085773 22.317460 5.520070e-10 9.245028 1.434447 1.089662 6.720919 42.302361 4.490935 1.122734 2.245467 37.050213 14.595538 742.98 0.00000 0.000000 0.67 0.752232 0.000262 23024.0 26141.0 265.0 2111.0 0.002689 0.520646 0.002865 0.567431 0.000082 0.004533 0.000797 0.046532 3.0 15.0 9.0 14.0 10.666667 Santa Rosa-Petaluma, CA Metro Area 966 1406 2882 2041.60 2169.0 1853 1286 289 358 759 0.317460 0.301587 0.380952 342 1.065048 0.125454 0.88972 2.043933 0 0 1 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 6.306880 45.213616 51.520496 18.791349 72.049792 39.60906 45.904762 11935.238176 38.546556 7055.254272 2.306910e+06 MULTIPOLYGON (((-2279605.043 2026679.113, -227... Sonoma County Bay Area
5 060971513093 060971513093 06 097 151309 3 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 982.997406 0.0 982.997406 982.846330 1610 712 648 0.574 77 0.118827 187 0.288580 384 0.592593 665 149 171 345 0.224060 282 0 0 31 245 6 0 0 31 5 6 61 179 0 83 104 95 0.294326 0.724427 1.638099 0.286922 0.000000 0.000000 0.031541 0.249276 0.006105 0.000000 0.000000 0.031541 0.005087 0.006105 0.062065 0.182124 0.0 1.011348 0.0 0.435185 0.406720 0.277630 0.631164 0.488506 0.540334 0.441643 0.477021 9.053520e-01 0.298097 0.595565 2.358156 2.571345e-01 6.367728 0.795925 0.849893 4.721909 21.711227 1.302140 4.557489 0.000000 22.136376 3.906419 1098.38 0.00000 0.000000 0.67 0.436217 0.000416 22544.0 25770.0 368.0 1478.0 0.002633 0.509792 0.002824 0.559378 0.000114 0.006295 0.000558 0.032579 11.0 8.0 7.0 13.0 9.833333 Santa Rosa-Petaluma, CA Metro Area 660 665 1722 1156.32 1084.0 1283 647 149 171 345 0.294326 0.368794 0.336879 342 0.544895 0.252253 0.58552 1.744303 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 7.067722 45.780375 52.848097 18.791349 72.049792 39.60906 47.087654 12242.790059 36.053805 8233.425925 3.978132e+06 MULTIPOLYGON (((-2279501.685 2025691.2, -22794... Sonoma County Bay Area
6 060971513101 060971513101 06 097 151310 1 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 51.480814 0.0 51.480814 51.080456 506 199 182 0.709 0 0.000000 35 0.192308 147 0.807692 268 63 73 132 0.235075 142 14 0 8 19 101 14 0 8 13 101 0 6 0 96 37 9 0.676056 3.895815 9.905941 2.779928 0.274077 0.000000 0.156616 0.371962 1.977273 0.274077 0.000000 0.156616 0.254500 1.977273 0.000000 0.117462 0.0 6.675743 0.0 0.780220 0.650603 0.560399 0.612247 0.473864 0.671536 0.565885 0.621614 5.384277e-01 0.438272 0.692683 1.887324 4.117562e-01 27.510900 0.000000 9.572344 17.938556 116.063433 0.000000 12.431816 24.863632 87.022711 24.863632 580.70 0.00000 0.479764 2.00 24.863632 0.003953 21597.0 25470.0 185.0 1195.0 0.002523 0.488377 0.002791 0.552866 0.000057 0.003164 0.000451 0.026341 15.0 7.0 15.0 15.0 13.666667 Santa Rosa-Petaluma, CA Metro Area 200 268 552 383.68 365.0 317 190 63 73 132 0.676056 0.260563 0.063380 342 1.588381 1.329705 0.00000 2.941200 0 0 3 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 3.675845 24.646966 28.322812 18.791349 72.049792 39.60906 25.235625 6561.262585 82.103377 1863.752846 2.083395e+05 MULTIPOLYGON (((-2280677.682 2023214.453, -228... Sonoma County Bay Area
7 060971513102 060971513102 06 097 151310 2 488 San Jose-San Francisco-Oakland, CA 42220 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 94.734433 0.0 94.734433 89.936999 1944 700 674 0.653 68 0.100890 97 0.143917 509 0.755193 502 93 166 243 0.185259 18 1 1 3 13 0 1 1 3 1 0 0 12 0 11 5 2 0.611111 7.783226 21.615131 0.200140 0.011119 0.011119 0.033357 0.144546 0.000000 0.011119 0.011119 0.033357 0.011119 0.000000 0.000000 0.133427 0.0 7.983366 0.0 0.026706 0.616612 0.531121 0.652815 0.505263 0.088743 0.129022 0.139861 1.035771e-07 0.018349 0.069231 27.888889 2.100409e-12 21.003141 0.000000 4.051075 16.952065 92.364304 0.000000 6.755727 13.511455 101.335910 6.755727 692.02 0.15082 0.597189 2.00 13.511455 0.001029 21512.0 25326.0 162.0 494.0 0.002513 0.486455 0.002775 0.549741 0.000050 0.002771 0.000186 0.010889 1.0 8.0 14.0 14.0 10.833333 Santa Rosa-Petaluma, CA Metro Area 636 502 1741 1295.36 1306.0 1217 915 93 166 243 0.611111 0.277778 0.111111 342 2.172844 0.182438 0.00000 2.887705 0 0 3 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.009743 33.743783 37.753526 18.791349 72.049792 39.60906 33.638392 8745.981842 64.395923 2853.102115 3.833834e+05 MULTIPOLYGON (((-2281109.446 2023606.289, -228... Sonoma County Bay Area
In [87]:
print(california_data.shape)
(13471, 184)

Data Transofrmation¶

Work with CSV format for Modeling¶

In [17]:
#california_data.to_csv('C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/cal_data_delete.csv', index=False)
model_data = pd.read_csv('C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/cal_data_delete.csv')
model_data.drop(['Region', 'Shape_Length', 'Shape_Area', 'geometry'], axis=1, inplace=True)
print(model_data.shape)
model_data.head()
(13471, 179)
Out[17]:
GEOID10 GEOID20 STATEFP COUNTYFP TRACTCE BLKGRPCE CSA CSA_Name CBSA CBSA_Name CBSA_POP CBSA_EMP CBSA_WRK Ac_Total Ac_Water Ac_Land Ac_Unpr TotPop CountHU HH P_WrkAge AutoOwn0 Pct_AO0 AutoOwn1 Pct_AO1 AutoOwn2p Pct_AO2p Workers R_LowWageWk R_MedWageWk R_HiWageWk R_PCTLOWWAGE TotEmp E5_Ret E5_Off E5_Ind E5_Svc E5_Ent E8_Ret E8_off E8_Ind E8_Svc E8_Ent E8_Ed E8_Hlth E8_Pub E_LowWageWk E_MedWageWk E_HiWageWk E_PctLowWage D1A D1B D1C D1C5_RET D1C5_OFF D1C5_IND D1C5_SVC D1C5_ENT D1C8_RET D1C8_OFF D1C8_IND D1C8_SVC D1C8_ENT D1C8_ED D1C8_HLTH D1C8_PUB D1D D1_FLAG D2A_JPHH D2B_E5MIX D2B_E5MIXA D2B_E8MIX D2B_E8MIXA D2A_EPHHM D2C_TRPMX1 D2C_TRPMX2 D2C_TRIPEQ D2R_JOBPOP D2R_WRKEMP D2A_WRKEMP D2C_WREMLX D3A D3AAO D3AMM D3APO D3B D3BAO D3BMM3 D3BMM4 D3BPO3 D3BPO4 D4A D4B025 D4B050 D4C D4D D4E D5AR D5AE D5BR D5BE D5CR D5CRI D5CE D5CEI D5DR D5DRI D5DE D5DEI D2A_Ranked D2B_Ranked D3B_Ranked D4A_Ranked NatWalkInd Households Workers_1 Residents Drivers White Male Lowwage Medwage Highwage W_P_Lowwage W_P_Medwage W_P_Highwage GasPrice logd1a logd1c logd3aao logd3apo d4bo25 d5dei_1 logd4d UPTpercap B_C_constant B_C_male B_C_ld1c B_C_drvmveh B_C_ld1a B_C_ld3apo B_C_inc1 B_C_gasp B_N_constant B_N_inc2 B_N_inc3 B_N_white B_N_male B_N_drvmveh B_N_gasp B_N_ld1a B_N_ld1c B_N_ld3aao B_N_ld3apo B_N_d4bo25 B_N_d5dei B_N_UPTpc C_R_Households C_R_Pop C_R_Workers C_R_Drivers C_R_Vehicles C_R_White C_R_Male C_R_Lowwage C_R_Medwage C_R_Highwage C_R_DrmV NonCom_VMT_Per_Worker Com_VMT_Per_Worker VMT_per_worker VMT_tot_min VMT_tot_max VMT_tot_avg GHG_per_worker Annual_GHG SLC_score COUNTY_NAME target
0 60971513084 60971513084 6 97 151308 4 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 67.135587 0.0 67.135587 60.291730 396 169 131 0.490 0 0.000000 10 0.076336 121 0.923664 276 46 78 152 0.166667 30 4 0 16 10 0 4 0 16 9 0 0 1 0 6 9 15 0.200000 2.803038 6.568065 0.497581 0.066344 0.000000 0.265376 0.165860 0.000000 0.066344 0.000000 0.265376 0.149274 0.000000 0.000000 0.016586 0.0 3.300619 0.0 0.229008 0.883037 0.602767 0.777957 0.518638 0.477267 0.397559 0.395591 9.525833e-02 0.140845 0.196078 9.200000 2.746536e-04 22.454792 0.000000 0.064477 22.390315 149.409879 0.000000 0.000000 0.000000 181.125994 28.598841 615.57 0.00000 0.000000 0.67 6.387075 0.001692 20817.0 23238.0 3423.0 4467.0 0.002432 0.470739 0.002547 0.504417 0.001056 0.058551 0.001686 0.098464 9.0 9.0 17.0 15.0 13.666667 160 276 464 273.68 324 244 46 78 152 0.200000 0.300000 0.500000 342 1.335800 0.403851 0.00000 3.152322 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.586229 36.685037 41.271266 18.791349 72.049792 39.60906 36.772698 9560.901540 57.790885 Sonoma County Bay Area
1 60971513092 60971513092 6 97 151309 2 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 570.037214 0.0 570.037214 471.335950 2560 896 869 0.593 8 0.009206 200 0.230150 661 0.760644 1406 289 358 759 0.205548 63 1 0 24 36 2 1 0 24 13 2 5 18 0 20 19 24 0.317460 1.900980 5.431370 0.133663 0.002122 0.000000 0.050919 0.076379 0.004243 0.002122 0.000000 0.050919 0.027581 0.004243 0.010608 0.038189 0.0 2.034642 0.0 0.072497 0.622319 0.536037 0.796767 0.686537 0.189938 0.234089 0.244020 1.481149e-03 0.048037 0.085773 22.317460 5.520070e-10 9.245028 1.434447 1.089662 6.720919 42.302361 4.490935 1.122734 2.245467 37.050213 14.595538 742.98 0.00000 0.000000 0.67 0.752232 0.000262 23024.0 26141.0 265.0 2111.0 0.002689 0.520646 0.002865 0.567431 0.000082 0.004533 0.000797 0.046532 3.0 15.0 9.0 14.0 10.666667 966 1406 2882 2041.60 1853 1286 289 358 759 0.317460 0.301587 0.380952 342 1.065048 0.125454 0.88972 2.043933 0 0 1 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 6.306880 45.213616 51.520496 18.791349 72.049792 39.60906 45.904762 11935.238176 38.546556 Sonoma County Bay Area
2 60971513093 60971513093 6 97 151309 3 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 982.997406 0.0 982.997406 982.846330 1610 712 648 0.574 77 0.118827 187 0.288580 384 0.592593 665 149 171 345 0.224060 282 0 0 31 245 6 0 0 31 5 6 61 179 0 83 104 95 0.294326 0.724427 1.638099 0.286922 0.000000 0.000000 0.031541 0.249276 0.006105 0.000000 0.000000 0.031541 0.005087 0.006105 0.062065 0.182124 0.0 1.011348 0.0 0.435185 0.406720 0.277630 0.631164 0.488506 0.540334 0.441643 0.477021 9.053520e-01 0.298097 0.595565 2.358156 2.571345e-01 6.367728 0.795925 0.849893 4.721909 21.711227 1.302140 4.557489 0.000000 22.136376 3.906419 1098.38 0.00000 0.000000 0.67 0.436217 0.000416 22544.0 25770.0 368.0 1478.0 0.002633 0.509792 0.002824 0.559378 0.000114 0.006295 0.000558 0.032579 11.0 8.0 7.0 13.0 9.833333 660 665 1722 1156.32 1283 647 149 171 345 0.294326 0.368794 0.336879 342 0.544895 0.252253 0.58552 1.744303 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 7.067722 45.780375 52.848097 18.791349 72.049792 39.60906 47.087654 12242.790059 36.053805 Sonoma County Bay Area
3 60971513101 60971513101 6 97 151310 1 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 51.480814 0.0 51.480814 51.080456 506 199 182 0.709 0 0.000000 35 0.192308 147 0.807692 268 63 73 132 0.235075 142 14 0 8 19 101 14 0 8 13 101 0 6 0 96 37 9 0.676056 3.895815 9.905941 2.779928 0.274077 0.000000 0.156616 0.371962 1.977273 0.274077 0.000000 0.156616 0.254500 1.977273 0.000000 0.117462 0.0 6.675743 0.0 0.780220 0.650603 0.560399 0.612247 0.473864 0.671536 0.565885 0.621614 5.384277e-01 0.438272 0.692683 1.887324 4.117562e-01 27.510900 0.000000 9.572344 17.938556 116.063433 0.000000 12.431816 24.863632 87.022711 24.863632 580.70 0.00000 0.479764 2.00 24.863632 0.003953 21597.0 25470.0 185.0 1195.0 0.002523 0.488377 0.002791 0.552866 0.000057 0.003164 0.000451 0.026341 15.0 7.0 15.0 15.0 13.666667 200 268 552 383.68 317 190 63 73 132 0.676056 0.260563 0.063380 342 1.588381 1.329705 0.00000 2.941200 0 0 3 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 3.675845 24.646966 28.322812 18.791349 72.049792 39.60906 25.235625 6561.262585 82.103377 Sonoma County Bay Area
4 60971513102 60971513102 6 97 151310 2 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 94.734433 0.0 94.734433 89.936999 1944 700 674 0.653 68 0.100890 97 0.143917 509 0.755193 502 93 166 243 0.185259 18 1 1 3 13 0 1 1 3 1 0 0 12 0 11 5 2 0.611111 7.783226 21.615131 0.200140 0.011119 0.011119 0.033357 0.144546 0.000000 0.011119 0.011119 0.033357 0.011119 0.000000 0.000000 0.133427 0.0 7.983366 0.0 0.026706 0.616612 0.531121 0.652815 0.505263 0.088743 0.129022 0.139861 1.035771e-07 0.018349 0.069231 27.888889 2.100409e-12 21.003141 0.000000 4.051075 16.952065 92.364304 0.000000 6.755727 13.511455 101.335910 6.755727 692.02 0.15082 0.597189 2.00 13.511455 0.001029 21512.0 25326.0 162.0 494.0 0.002513 0.486455 0.002775 0.549741 0.000050 0.002771 0.000186 0.010889 1.0 8.0 14.0 14.0 10.833333 636 502 1741 1295.36 1217 915 93 166 243 0.611111 0.277778 0.111111 342 2.172844 0.182438 0.00000 2.887705 0 0 3 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.009743 33.743783 37.753526 18.791349 72.049792 39.60906 33.638392 8745.981842 64.395923 Sonoma County Bay Area
In [3]:
model_data['GEOID20'].is_unique
Out[3]:
True
In [4]:
# Check distinct values for suspicious columns
model_data['target'].value_counts().sort_values(ascending=False)  #UPTpercap, Ac_Water
Out[4]:
target
Southern California    8968
Bay Area               2517
Central California     1737
Northern California     249
Name: count, dtype: int64
In [8]:
# Separate columns
description_cols = model_data.columns[:10].tolist() + ['COUNTY_NAME']
input_cols = model_data.columns[10:-2].tolist()
target_col = ['target']
print(description_cols)
print(input_cols)
print(target_col)
['GEOID10', 'GEOID20', 'STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE', 'CSA', 'CSA_Name', 'CBSA', 'CBSA_Name', 'COUNTY_NAME']
['CBSA_POP', 'CBSA_EMP', 'CBSA_WRK', 'Ac_Total', 'Ac_Water', 'Ac_Land', 'Ac_Unpr', 'TotPop', 'CountHU', 'HH', 'P_WrkAge', 'AutoOwn0', 'Pct_AO0', 'AutoOwn1', 'Pct_AO1', 'AutoOwn2p', 'Pct_AO2p', 'Workers', 'R_LowWageWk', 'R_MedWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp', 'E5_Ret', 'E5_Off', 'E5_Ind', 'E5_Svc', 'E5_Ent', 'E8_Ret', 'E8_off', 'E8_Ind', 'E8_Svc', 'E8_Ent', 'E8_Ed', 'E8_Hlth', 'E8_Pub', 'E_LowWageWk', 'E_MedWageWk', 'E_HiWageWk', 'E_PctLowWage', 'D1A', 'D1B', 'D1C', 'D1C5_RET', 'D1C5_OFF', 'D1C5_IND', 'D1C5_SVC', 'D1C5_ENT', 'D1C8_RET', 'D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_ENT', 'D1C8_ED', 'D1C8_HLTH', 'D1C8_PUB', 'D1D', 'D1_FLAG', 'D2A_JPHH', 'D2B_E5MIX', 'D2B_E5MIXA', 'D2B_E8MIX', 'D2B_E8MIXA', 'D2A_EPHHM', 'D2C_TRPMX1', 'D2C_TRPMX2', 'D2C_TRIPEQ', 'D2R_JOBPOP', 'D2R_WRKEMP', 'D2A_WRKEMP', 'D2C_WREMLX', 'D3A', 'D3AAO', 'D3AMM', 'D3APO', 'D3B', 'D3BAO', 'D3BMM3', 'D3BMM4', 'D3BPO3', 'D3BPO4', 'D4A', 'D4B025', 'D4B050', 'D4C', 'D4D', 'D4E', 'D5AR', 'D5AE', 'D5BR', 'D5BE', 'D5CR', 'D5CRI', 'D5CE', 'D5CEI', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI', 'D2A_Ranked', 'D2B_Ranked', 'D3B_Ranked', 'D4A_Ranked', 'NatWalkInd', 'Households', 'Workers_1', 'Residents', 'Drivers', 'White', 'Male', 'Lowwage', 'Medwage', 'Highwage', 'W_P_Lowwage', 'W_P_Medwage', 'W_P_Highwage', 'GasPrice', 'logd1a', 'logd1c', 'logd3aao', 'logd3apo', 'd4bo25', 'd5dei_1', 'logd4d', 'UPTpercap', 'B_C_constant', 'B_C_male', 'B_C_ld1c', 'B_C_drvmveh', 'B_C_ld1a', 'B_C_ld3apo', 'B_C_inc1', 'B_C_gasp', 'B_N_constant', 'B_N_inc2', 'B_N_inc3', 'B_N_white', 'B_N_male', 'B_N_drvmveh', 'B_N_gasp', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3aao', 'B_N_ld3apo', 'B_N_d4bo25', 'B_N_d5dei', 'B_N_UPTpc', 'C_R_Households', 'C_R_Pop', 'C_R_Workers', 'C_R_Drivers', 'C_R_Vehicles', 'C_R_White', 'C_R_Male', 'C_R_Lowwage', 'C_R_Medwage', 'C_R_Highwage', 'C_R_DrmV', 'NonCom_VMT_Per_Worker', 'Com_VMT_Per_Worker', 'VMT_per_worker', 'VMT_tot_min', 'VMT_tot_max', 'VMT_tot_avg', 'GHG_per_worker', 'Annual_GHG', 'SLC_score']
['target']

Check correlations manually to detect weirdness/problems¶

In [ ]:
all_cols = []
delete_cols = []
random_elements = {}
threshold = 0.7
counter = 0

while counter < 2:

    all_cols.extend(delete_cols)
    all_cols.extend(random_elements)
    X_copy = model_data[input_cols].drop(columns=all_cols)
    corr_matrix = X_copy.corr()
    
    # Find pairs with high correlation
    high_corr_pairs = {}
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_pairs[(corr_matrix.columns[i], corr_matrix.columns[j])] = corr_matrix.iloc[i, j]
    
    
    # Separate columns and count how often they appear
    high_corr_cols = list(high_corr_pairs.keys())  #need to unpack tuples and delete duplicates
    high_corr_cols = [value for item in high_corr_cols for value in item]  #<https://stackoverflow.com/questions/10632839/transform-list-of-tuples-into-a-flat-list-or-a-matrix>
    counter_dict = Counter(high_corr_cols)
    keys_delete = counter_dict.keys()
    keys_delete = list(set(keys_delete))
    high_corr_pairs_copy = {k: v for k, v in high_corr_pairs.items() if all(i in keys_delete for i in k)}
    delete_cols = [tool for tool, value in counter_dict.items() if value > 1]
    if high_corr_pairs_copy:
        for key in high_corr_pairs_copy:
            random_elements = [random.choice(key) for key in high_corr_pairs_copy]
    print(counter_dict)
    print(delete_cols)
    print(high_corr_pairs_copy)
    print(random_elements)
    #Update Counter
    if not delete_cols or random_elements:
        counter += 1
    
X_copy.columns.tolist()
In [14]:
# Check again
corr_matrix = X_copy.corr()
threshold = 0.7

# Find pairs with high correlation
high_corr_pairs = {}
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            high_corr_pairs[(corr_matrix.columns[i], corr_matrix.columns[j])] = corr_matrix.iloc[i, j]   
            
# Print something if high correlation pairs are/aren't found 
if high_corr_pairs:
    print(high_corr_pairs)
    # Print out high correlation pairs
    print("Pairs with high correlation (|correlation| > {:.2f}):".format(threshold))
    for pair, corr_value in high_corr_pairs.items():
        print(f"{pair[0]} and {pair[1]}: {corr_value:.2f}")
else:
    print("There are no pairs with a correlation above the threshold: (|correlation| > {:.2f}):".format(threshold))
There are no pairs with a correlation above the threshold: (|correlation| > 0.70):
In [19]:
# Grab low corr columns and try to get rid of county-level variables
low_corr_cols = X_copy.columns.tolist()
county_lvl_cols = [s for s in low_corr_cols if s.startswith("C_R") or s.startswith("B_N") or s.startswith("B_C") or s.startswith("VMT_tot")]
county_lvl_cols.extend(('R_PCTLOWWAGE','GasPrice', 'UPTpercap', 'Ac_Water', 'D1_FLAG', 'd4bo25', 'd5dei_1', 'D4A_Ranked','D4B050' ))  # UPTpercap, Ac_water, D1_FLAG
low_corr_cols = [x for x in low_corr_cols if x not in county_lvl_cols]

# Encode and reshape data
X = model_data[low_corr_cols]
y = model_data[target_col]
num_target = LabelEncoder()
y = num_target.fit_transform(y).flatten()

#Check results
print(county_lvl_cols)
print(X.shape)
print(y.shape)
['B_C_male', 'B_C_ld1c', 'B_C_ld1a', 'B_C_ld3apo', 'B_C_inc1', 'B_N_drvmveh', 'B_N_ld3apo', 'C_R_White', 'C_R_Male', 'C_R_DrmV', 'VMT_tot_max', 'VMT_tot_avg', 'R_PCTLOWWAGE', 'GasPrice', 'UPTpercap', 'Ac_Water', 'D1_FLAG', 'd4bo25', 'd5dei_1', 'D4A_Ranked', 'D4B050']
(13471, 33)
(13471,)
c:\Users\rdn91\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_label.py:116: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

EDA¶

In [ ]:
# Check for nulls in any column for each row
X[(X.isna()).all(axis=1)]
Out[ ]:
Ac_Land Ac_Unpr P_WrkAge AutoOwn0 Pct_AO1 E5_Ret E8_Ent E8_Ed E8_Hlth D1C5_ENT D1C8_RET D1C8_IND D1C8_HLTH D2A_JPHH D2C_TRIPEQ D2A_WRKEMP D3AMM D3BMM3 D3BMM4 D3BPO4 D4A D4C D4D D4E D5BE D5CEI NatWalkInd White W_P_Medwage logd1c logd4d NonCom_VMT_Per_Worker
In [10]:
# Check for any negative values that don't make sense (column-wise) for each row
X[(X < 0).all(axis=1)]
Out[10]:
Ac_Land Ac_Unpr P_WrkAge AutoOwn0 Pct_AO1 E5_Ret E8_Ent E8_Ed E8_Hlth D1C5_ENT D1C8_RET D1C8_IND D1C8_HLTH D2A_JPHH D2C_TRIPEQ D2A_WRKEMP D3AMM D3BMM3 D3BMM4 D3BPO4 D4A D4C D4D D4E D5BE D5CEI NatWalkInd White W_P_Medwage logd1c logd4d NonCom_VMT_Per_Worker
In [123]:
X.columns
Out[123]:
Index(['Ac_Total', 'Ac_Unpr', 'P_WrkAge', 'AutoOwn0', 'Pct_AO2p', 'E5_Ret',
       'E8_Ent', 'E8_Hlth', 'D1C8_RET', 'D1C8_IND', 'D1C8_ENT', 'D1C8_ED',
       'D1C8_HLTH', 'D1C8_PUB', 'D2A_JPHH', 'D2C_TRIPEQ', 'D2A_WRKEMP',
       'D3AMM', 'D3BMM3', 'D3BMM4', 'D3BPO3', 'D3BPO4', 'D4A', 'D4C', 'D4D',
       'D4E', 'D5BE', 'D5CEI', 'NatWalkInd', 'White', 'W_P_Medwage', 'logd1c',
       'logd4d', 'NonCom_VMT_Per_Worker'],
      dtype='object')
In [101]:
plot_cols(X, X.columns, 4, "histplot")
No description has been provided for this image
In [102]:
plot_cols(X, X.columns, 4, "kdeplot")
No description has been provided for this image
In [103]:
X.describe()
Out[103]:
Ac_Land Ac_Unpr P_WrkAge AutoOwn0 Pct_AO2p E5_Ret E5_Ent E8_Hlth D1C5_RET D1C5_IND D1C5_ENT D1C8_ED D1C8_HLTH D2A_JPHH D2C_TRIPEQ D2A_WRKEMP D3AMM D3BMM3 D3BMM4 D3BPO4 D4A D4C D4D D4E D5BR D5CRI NatWalkInd White W_P_Medwage logd1c logd4d NonCom_VMT_Per_Worker
count 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 1.347100e+04 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 1.347100e+04 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000 13471.000000
mean 339.664921 250.658867 0.620634 48.916784 0.593864 82.285502 100.380744 124.242150 0.632095 0.737723 0.929933 0.523923 1.232386 2.208971 4.729318e-01 6.017315 3.878526 23.160982 14.113828 35.506695 460.697881 8.727682 97.765096 0.007959 1.789674e+05 0.495927 14.187860 522.830970 0.366496 1.336787 2.945438 2.692507
std 4116.565820 1310.140462 0.092262 82.042930 0.197057 252.685108 399.282251 459.071392 1.961353 2.917049 3.652411 4.896970 5.135792 27.981841 2.803600e-01 11.105327 3.499955 27.653597 20.140445 45.755758 256.464434 14.407550 299.399298 0.065520 2.112026e+05 0.217665 2.251361 479.981067 0.109384 0.916786 1.777173 1.158195
min 3.806823 3.806823 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.312035e-198 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.330000 0.000518 0.000000 5.000000e+00 0.000000 5.666667 0.000000 0.000000 0.000000 0.000000 0.420682
25% 64.009278 62.491096 0.567000 8.000000 0.464562 0.000000 0.000000 17.000000 0.000000 0.029345 0.000000 0.000000 0.139561 0.193846 2.859821e-01 1.086739 1.450393 2.039375 0.000000 7.864765 274.930000 2.000000 8.747939 0.001367 2.466050e+04 0.352438 12.666667 150.000000 0.296687 0.632475 2.000000 1.878572
50% 106.410403 102.778227 0.616000 25.000000 0.612903 13.000000 18.000000 37.000000 0.111695 0.123894 0.149405 0.003743 0.362378 0.452349 4.870922e-01 2.900000 3.227179 15.239382 7.966047 20.327148 426.480000 4.670000 25.189643 0.003074 9.063600e+04 0.522043 14.166667 414.000000 0.370327 1.159634 3.000000 2.443149
75% 183.090845 174.740024 0.670000 60.000000 0.744975 63.000000 84.000000 89.000000 0.538239 0.455436 0.708771 0.365651 0.902096 1.157211 6.777726e-01 6.929154 5.413592 33.179379 18.999462 45.465900 618.260000 10.000000 71.899564 0.007014 2.753380e+05 0.648365 15.833333 762.000000 0.437679 1.859992 4.000000 3.319817
max 407868.018652 61946.753520 1.000000 1710.000000 1.000000 5780.000000 27592.000000 12572.000000 84.893696 121.727268 139.624405 376.807057 326.152342 1862.888889 9.995461e-01 414.000000 41.753275 357.134925 432.828295 671.062217 1207.000000 402.330000 17191.471949 5.734000 1.193711e+06 1.000000 20.000000 6371.000000 1.000000 6.644192 10.000000 10.911256

Feature Selection¶

In [104]:
print(X.shape, y.shape)
(13471, 32) (13471,)
In [105]:
# Split data into training, validation, and test --> 70-15-15 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, stratify=y) 
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)
In [106]:
print("Original data shape:", X.shape, "\nOutput:", y.shape, "\n")
print("Shape of training dataset: ", "\n", "Input Features:", X_train.shape, "\n", "Output:",  y_train.shape)
print("\nShape of validation dataset: ", "\n", "Input Features:", X_validation.shape, "\n", "Output:",  y_validation.shape)
print("\nShape of testing dataset: ", "\n", "Input Features:", X_test.shape, "\n", "Output:",  y_test.shape)
Original data shape: (13471, 32) 
Output: (13471,) 

Shape of training dataset:  
 Input Features: (9429, 32) 
 Output: (9429,)

Shape of validation dataset:  
 Input Features: (2021, 32) 
 Output: (2021,)

Shape of testing dataset:  
 Input Features: (2021, 32) 
 Output: (2021,)
In [20]:
# Scale the data
scaler = StandardScaler().set_output(transform="pandas")
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
In [108]:
# Permutation Importants using Random Forest
rf_feature = RandomForestClassifier(n_estimators=100)
rf_feature.fit(X_scaled, y)
importances = rf_feature.feature_importances_
#std = np.std([RF.feature_importances_ for tree in RF.estimators_], axis=0)
idf = pd.DataFrame({'feature': X_scaled.columns, 'importance': importances})
#idf = pd.DataFrame({'feature': X_train.columns, 'importance': importances, 'std': std})
idf = idf.sort_values('importance',ascending=False)
print(idf)
ax = idf.plot(kind='barh', x='feature', legend=False)
ax.set_ylabel('')
                  feature  importance
31  NonCom_VMT_Per_Worker    0.248721
25                  D5CRI    0.058175
24                   D5BR    0.052645
4                Pct_AO2p    0.038953
27                  White    0.038428
22                    D4D    0.038009
1                 Ac_Unpr    0.035215
0                 Ac_Land    0.033485
19                 D3BPO4    0.030505
28            W_P_Medwage    0.027864
29                 logd1c    0.027232
26             NatWalkInd    0.024348
12              D1C8_HLTH    0.024054
21                    D4C    0.022016
23                    D4E    0.021928
2                P_WrkAge    0.021344
16                  D3AMM    0.020406
20                    D4A    0.020170
3                AutoOwn0    0.020124
15             D2A_WRKEMP    0.019464
9                D1C5_IND    0.019160
7                 E8_Hlth    0.017141
17                 D3BMM3    0.016151
13               D2A_JPHH    0.015960
10               D1C5_ENT    0.015623
14             D2C_TRIPEQ    0.015366
18                 D3BMM4    0.015231
8                D1C5_RET    0.013872
30                 logd4d    0.013360
6                  E5_Ent    0.012291
5                  E5_Ret    0.011754
11                D1C8_ED    0.011006
Out[108]:
Text(0, 0.5, '')
No description has been provided for this image
In [110]:
# Testing Random Forest Feature Importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y)
importances_rf = rf_model.feature_importances_

# Select the top 5 features based on feature importance
top_5_rf = np.argsort(importances_rf)[-5:]  # Get indices of top 5 features
selected_features_rf = X_scaled.columns[top_5_rf]
X_scaled = X_scaled[selected_features_rf]
X_scaled
Out[110]:
White Pct_AO2p D5BR D5CRI NonCom_VMT_Per_Worker
0 -0.414263 1.673694 -0.831197 -0.115725 1.635124
1 2.771398 0.846390 -0.846150 0.113568 3.120810
2 1.583807 -0.006452 -0.845662 0.063699 3.777755
3 -0.428847 1.085152 -0.846528 -0.034688 0.849058
4 1.446296 0.818724 -0.846637 -0.043519 1.137360
... ... ... ... ... ...
13466 -0.124657 0.200302 -0.701804 -1.922919 -0.145962
13467 0.119111 0.624003 -0.554665 -0.415139 -0.612506
13468 0.083692 1.415196 -0.511691 -0.861541 -0.386401
13469 0.171199 1.350357 -0.569950 -1.045051 -0.193661
13470 0.092026 0.511127 -0.521359 -0.773360 -0.424307

13471 rows × 5 columns

Supervised Learning¶

In [109]:
# Do K-Fold Cross Validation
pipeline = Pipeline ([
    ('smote',  BorderlineSMOTE(sampling_strategy='not majority', m_neighbors=100, k_neighbors=100)),
    ('scale', StandardScaler()),
    ('rf', RandomForestClassifier())  #no combination of hyperparameters led to good increase in f1 score, just focus on feature engineering and not hyperparameter tuning
])
param_grid = {
              }
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, n_jobs=-1,scoring='f1_weighted', refit=True)
grid_data = grid.fit(X_train, y_train)
results = pd.DataFrame(grid_data.cv_results_)
results.sort_values(by=['rank_test_score'])
Out[109]:
mean_fit_time std_fit_time mean_score_time std_score_time params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 11.355697 0.150327 0.028233 0.004922 {} 0.858138 0.861453 0.844702 0.859861 0.860895 0.85701 0.006256 1
In [111]:
# Do K-Fold Cross Validation
pipeline = Pipeline ([
    ('smote',  BorderlineSMOTE(sampling_strategy='not majority', m_neighbors=100, k_neighbors=100)),
    ('scale', StandardScaler()),
    ('xgb', XGBClassifier(objective='multi:softprob', colsample_bytree=0.9, eta=0.6))  #no combination of hyperparameters led to good increase in f1 score, just focus on feature engineering and not hyperparameter tuning
])
param_grid = {
              }
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, n_jobs=-1,scoring='f1_weighted', refit=True)
grid_data = grid.fit(X_train, y_train)
results = pd.DataFrame(grid_data.cv_results_)
results.sort_values(by=['rank_test_score'])
Out[111]:
mean_fit_time std_fit_time mean_score_time std_score_time params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 3.107797 0.032659 0.01853 0.001632 {} 0.89548 0.911367 0.898752 0.912347 0.908622 0.905314 0.006882 1
In [112]:
#<https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html>
# Plot non-normalized confusion matrix
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),  # true --> sum of true values (rowwise total sum)
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        grid_data,
        X_validation,
        y_validation,
        display_labels=num_target.inverse_transform([0,1,2,3]),
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)
    fig = disp.figure_
    fig.set_figwidth(10)
    fig.set_figheight(10) 
    print(title)
    print(disp.confusion_matrix)

plt.show()



# F1-score
y_pred = grid_data.predict(X_validation)
print(classification_report(y_validation, y_pred, digits = 4))
Confusion matrix, without normalization
[[ 328   32    1   16]
 [  28  212    4   17]
 [   0    3   32    2]
 [  19   31    6 1290]]
Normalized confusion matrix
[[0.87002653 0.08488064 0.00265252 0.04244032]
 [0.10727969 0.81226054 0.01532567 0.0651341 ]
 [0.         0.08108108 0.86486486 0.05405405]
 [0.0141159  0.0230312  0.00445765 0.95839525]]
No description has been provided for this image
No description has been provided for this image
              precision    recall  f1-score   support

           0     0.8747    0.8700    0.8723       377
           1     0.7626    0.8123    0.7866       261
           2     0.7442    0.8649    0.8000        37
           3     0.9736    0.9584    0.9659      1346

    accuracy                         0.9213      2021
   macro avg     0.8388    0.8764    0.8562      2021
weighted avg     0.9237    0.9213    0.9223      2021

In [ ]:
# Test data
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),  # true --> sum of true values (rowwise total sum)
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        grid_data,
        X_test,
        y_test,
        display_labels=num_target.inverse_transform([0,1,2,3]),
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)
    fig = disp.figure_
    fig.set_figwidth(10)
    fig.set_figheight(10) 
    print(title)
    print(disp.confusion_matrix)

plt.show()



# F1-score
y_pred = grid_data.predict(X_test)
print(classification_report(y_test, y_pred, digits = 4))
In [278]:
# What does each label mean
print(y.dtype)
print(num_target.inverse_transform([0,1,2,3]))
print(Counter(y))
int32
['Bay Area' 'Central California' 'Northern California'
 'Southern California']
Counter({3: 8968, 0: 2517, 1: 1737, 2: 249})

Clustering¶

Dimensionality Reduction/Feature Extraction Methods¶

In [24]:
# For graphing 2d 
def plot_scatter_with_adjusted_labels(data, method_name,y):
    
    plt.figure(figsize=(10, 7))

    label_dict = {0: 'Bay Area', 
                  1: 'Central California', 
                  2: 'Northern California', 
                  3: 'Southern California'
                  }
    
    scatter = plt.scatter(data[:, -1], data[:, :-1], c=y, cmap='viridis')
    plt.title(f'{method_name} Scatter Plot with First Two Components')
    plt.xlabel(f'First {method_name} Dimension')
    plt.ylabel(f'Second {method_name} Dimension')
    
    # Adding all labels, with adjustText for avoiding overlap
    #texts = [plt.text(data[i, 0], data[i, 1], str(y.iloc[i]), fontsize=9) for i in range(len(y))]
    
    # Adjust the text labels to avoid overlaps
    #adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

    plt.colorbar(scatter)

    unique_labels = np.unique(y)
    handles = [plt.Line2D([0], [0], marker='o', color='w', 
                          markerfacecolor=scatter.cmap(scatter.norm(label)), markersize=10) 
               for label in unique_labels]
    
    plt.legend(handles, [label_dict[label] for label in unique_labels], title="Regions", loc="best")
    plt.show()
In [62]:
# For graphing 3d
def plot_scatter_with_adjusted_labels(data, method_name, y):
    fig = go.Figure(data=[go.Scatter3d(
        x=data[:, 0],
        y=data[:, 1],
        z=data[:, 2],
        mode='markers',
        marker=dict(color=y, 
                    colorscale='Viridis', 
                    showscale=True, 
                    colorbar=dict(
                        title='Region',
                        ticktext=['Bay Area', 'Central California', 'Northern California','Southern California'],
                        tickvals=[0,1,2,3],
                        ticks='outside'
                        )
                    ),
        text=y
    )])

    fig.update_layout(
        title=f'{method_name} Scatter Plot with First Three Components',
        scene=dict(
            xaxis_title=f'First {method_name} Dimension',
            yaxis_title=f'Second {method_name} Dimension',
            zaxis_title=f'Third {method_name} Dimension'
        ),
        width=1200,
        height=800
    )

    fig.update_layout(
        dragmode='drawrect',
    )


    

    fig.show(config=dict(displayModeBar=True))

Undersampling/Oversampling¶

In [21]:
over_sample = BorderlineSMOTE(sampling_strategy='not majority', m_neighbors=200, k_neighbors=200)
X_scaled_copy, y_copy = over_sample.fit_resample(X_scaled, y)
print(Counter(y_copy))
Counter({0: 8968, 3: 8968, 1: 8968, 2: 8968})

PCA¶

In [126]:
pca_result = PCA(n_components=2).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(pca_result, 'PCA',y_copy)
No description has been provided for this image

MDS¶

In [ ]:
mds_result = MDS(n_components=2, random_state=10, n_jobs=-1).fit_transform(X)
plot_scatter_with_adjusted_labels(mds_result, 'MDS',y)

Takes too long

IsoMap¶

In [ ]:
isomap_result = Isomap(n_neighbors=500,n_components=2).fit_transform(X)
plot_scatter_with_adjusted_labels(isomap_result, 'Isomap',y)

Takes too long

LLE¶

Will take too long

t-SNE (Oversampling)¶

In [16]:
# High corr threshold with 'not-majority' sampling 
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image
In [288]:
# Medium corr threshold with 'not-majority' sampling 
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image
In [294]:
# Low corr threshold with 'not-majority' sampling 
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image

BorderlineSmote t-SNE (Med. Corr Threshold)¶

In [25]:
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10, n_jobs=-1).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image

BorderlineSmote t-SNE (Med. Corr Threshold and Top 5 Features after Random Forest)¶

In [36]:
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10, n_jobs=-1).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image

t-SNE (Undersampling)¶

In [300]:
# Low corr threshold with 'not-minority' sampling 
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image
In [306]:
# Medium corr threshold with 'not-minority' sampling 
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image
In [312]:
# High corr threshold with 'not-minority' sampling 
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image
In [316]:
# Testing
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
No description has been provided for this image

t-SNE on original data (few columns vs a lot)¶

In [267]:
# Low correlation threshold (0.5)
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y)
No description has been provided for this image
In [284]:
# Med correlation threshold (0.7)
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y)
No description has been provided for this image
In [272]:
# High correlation threshold (0.9)
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y)
No description has been provided for this image

Clustering (Agglomerative and Hierarchical)¶

In [31]:
dbscan = DBSCAN(eps=0.00001,metric='euclidean', min_samples=1)
labels = dbscan.fit_predict(tsne_result)

fig, ax = plt.subplots(5, 2, figsize=(30, 40))

# Dimensionality reduction plot
og_plot = ax[0,0].scatter(tsne_result[:, -1], tsne_result[:, :-1], c=y_copy, cmap='viridis')
ax[0,0].set_title('Data after tsne and Borderline Smote using original labels')
plt.colorbar(og_plot, ax=ax[0,0])
label_dict = {0: 'Bay Area', 
                  1: 'Central California', 
                  2: 'Northern California', 
                  3: 'Southern California'
                  }
    
unique_labels = np.unique(y)
handles = [plt.Line2D([0], [0], marker='o', color='w', 
            markerfacecolor=og_plot.cmap(og_plot.norm(label)), markersize=10) for label in unique_labels]
    
ax[0,0].legend(handles, [label_dict[label] for label in unique_labels], title="Regions", loc="best")


# DBscan plot
cluster_plot = ax[0,1].scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels, cmap='viridis')
ax[0,1].set_title('Clustering with DBscan')
plt.colorbar(cluster_plot, ax=ax[0,1])


#Filtering greenish points
mask_green = (y_copy == 2)
green_plot = ax[1,0].scatter(tsne_result[mask_green, -1], tsne_result[mask_green, :-1], c='#22a884')
ax[1,0].set_title('Filtering the green values (Northern California)')


#Filtering yellowish points
mask_yellow = (y_copy == 3)
yellow_plot = ax[2,0].scatter(tsne_result[mask_yellow, -1], tsne_result[mask_yellow, :-1], c='#fde725')
ax[2,0].set_title('Filtering the yellow values (Southern California)')



#Filtering purpleish points
mask_purple = (y_copy == 0)
purple_plot = ax[3,0].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
ax[3,0].set_title('Filtering the purple values (Bay Area)')



#Filtering blueish points
mask_blue = (y_copy == 1)
blue_plot = ax[4,0].scatter(tsne_result[mask_blue, -1], tsne_result[mask_blue, :-1], c='#2a788e')
ax[4,0].set_title('Filtering the blue values (Central California)')




#Compare Central California and Bay Area
purple_plot = ax[4,1].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
blue_plot = ax[4,1].scatter(tsne_result[mask_blue, -1], tsne_result[mask_blue, :-1], c='#2a788e')
ax[4,1].set_title('Comparing Central California and Bay Area')



#Compare Northern California and Central California
green_plot = ax[1,1].scatter(tsne_result[mask_green, -1], tsne_result[mask_green, :-1], c='#22a884')
blue_plot = ax[1,1].scatter(tsne_result[mask_blue, -1], tsne_result[mask_blue, :-1], c='#2a788e')
ax[1,1].set_title('Comparing Central and Northern California')


#Compare Southern California and Bay Area
purple_plot = ax[2,1].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
yellow_plot = ax[2,1].scatter(tsne_result[mask_yellow, -1], tsne_result[mask_yellow, :-1], c='#fde725')
ax[2,1].set_title('Comparing Southern California and the Bay Area')


# Compare Bay Area and Northern California
purple_plot = ax[3,1].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
green_plot = ax[3,1].scatter(tsne_result[mask_green, -1], tsne_result[mask_green, :-1], c='#22a884')
ax[3,1].set_title('Comparing Bay Area and Northern California')


plt.show()



#'dice', 'precomputed', 'cityblock', 'hamming', 'yule', 'rogerstanimoto', 'correlation', 'mahalanobis', 'manhattan', 'l2', 'russellrao', 'sokalmichener',
#  'nan_euclidean', 'canberra', 'matching', 'cosine', 'chebyshev', 'sokalsneath', 'wminkowski', 'haversine', 'minkowski', 'kulsinski', 'sqeuclidean', 
# 'braycurtis', 'jaccard', 'seuclidean', 'l1', 'euclidean
No description has been provided for this image
In [129]:
fig, ax = plt.subplots(2, 2, figsize=(30, 20))

# DBscan plot
cluster_plot = ax[0,0].scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels, cmap='viridis')
ax[0,0].set_title('Clustering with DBscan')
plt.colorbar(cluster_plot, ax=ax[0,0])

# Form 2 clusters
mask_blue_greens = (labels >= 0) & (labels <=30000)
blue_greens_plot = ax[0,1].scatter(tsne_result[mask_blue_greens, -1], tsne_result[mask_blue_greens, :-1], c='#008B8B')
ax[0,1].set_title('Combining Bay Area, Central California, and Southern California')



# Plot other cluster
mask_yellow2 = (labels > 30000) 
yellow2_plot = ax[1,1].scatter(tsne_result[mask_yellow2, -1], tsne_result[mask_yellow2, :-1], c='#fde725')
ax[1,1].set_title('Northern California')

plt.show()
No description has been provided for this image
In [130]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(tsne_result)
labels = kmeans.labels_
plt.scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels)
plt.title("K-Means Clustering")
plt.show()
c:\Users\rdn91\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
No description has been provided for this image
In [131]:
model = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
model.fit(tsne_result)
labels = model.labels_

plt.scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels, cmap='viridis')
plt.title('Agglomerative Clustering')
plt.show()
No description has been provided for this image

Findings¶

Based on the scatter plot of the data after dimensionality reduction but before "clustering", I have some sort of intuition that there are kinda 2 clusters. For one cluster, we have Central California, Bay Area, and Southern California. The other cluster being primarily composed of Northern California. This result isn't really that surpising since we don't expect the census block groups (parts of a city) to be quite different from each other for different regions of California. Even though the cluster that contains the 3 California regions seems quite densely packed, there does seem to be some obvious patterns.


However, there is a major thing that needs to be remembered. There is an element of randomness for the feature selection and dimensionality reduction method. What this means is that each time we run the code, the results will not exactly be the same. However, I found that there were few cases where the general/global struture/shape of the results were totally different. My feature selection process is sort of like a pseudo-forward selection process using a correlation threshold.

Finding Patterns between Central California, Bay Area, and Southern California (Unsupervised + Supervised Learning)¶

image.png

After applying BorderlineSmote, I thought Central and Northern California wouldn't be that close knit as they are.
Central California seems to have a more diverse structure to it compared to Northern California.
Other than that, this results is not that surpising.

image.png

The data for Southern California are more dense and are "contained" inside the Bay Area structure.
However, I don't think I can say more in terms of quantitative patterns because it is not appropriate to give such meaning to data that has been through standardization, dimensionality reduction, and oversampling.
This plot goes with the general intuition that Southern California and Bay Area aren't that different in terms of what a typical "city" looks like besides the Bay Area being close to water.

image.png

This results aligns exactly as one might think. There is a reason why Bay Area is given its own region and not combined with another region.

image.png

Funny how I just said how the Bay Area should be given its own region but the plot does show that the census block groups for the Bay Area aren't quite as spread out. It's important to note that we only selected a certain subset of the available variables so the interpretation might change if I were to rerun all the code again.

Conclusion¶

I don't think that applying feature selection, dimensionality reduction, oversampling, and clustering will yield "significant" results when it comes to finding patterns involving census block groups from different California regions. However, just because I did not find a significant results, doesn't mean I should keep discard what the plots show. The clustering/plots affirm what we think about California regions and how different areas compare to one another. I think using various descriptive statistical techniques will bring about more impactful/insightful findings for this particular domain.

Finding Patterns using Descriptive Statistics¶

In [101]:
model_data.head()
Out[101]:
GEOID10 GEOID20 STATEFP COUNTYFP TRACTCE BLKGRPCE CSA CSA_Name CBSA CBSA_Name CBSA_POP CBSA_EMP CBSA_WRK Ac_Total Ac_Water Ac_Land Ac_Unpr TotPop CountHU HH P_WrkAge AutoOwn0 Pct_AO0 AutoOwn1 Pct_AO1 AutoOwn2p Pct_AO2p Workers R_LowWageWk R_MedWageWk R_HiWageWk R_PCTLOWWAGE TotEmp E5_Ret E5_Off E5_Ind E5_Svc E5_Ent E8_Ret E8_off E8_Ind E8_Svc E8_Ent E8_Ed E8_Hlth E8_Pub E_LowWageWk E_MedWageWk E_HiWageWk E_PctLowWage D1A D1B D1C D1C5_RET D1C5_OFF D1C5_IND D1C5_SVC D1C5_ENT D1C8_RET D1C8_OFF D1C8_IND D1C8_SVC D1C8_ENT D1C8_ED D1C8_HLTH D1C8_PUB D1D D1_FLAG D2A_JPHH D2B_E5MIX D2B_E5MIXA D2B_E8MIX D2B_E8MIXA D2A_EPHHM D2C_TRPMX1 D2C_TRPMX2 D2C_TRIPEQ D2R_JOBPOP D2R_WRKEMP D2A_WRKEMP D2C_WREMLX D3A D3AAO D3AMM D3APO D3B D3BAO D3BMM3 D3BMM4 D3BPO3 D3BPO4 D4A D4B025 D4B050 D4C D4D D4E D5AR D5AE D5BR D5BE D5CR D5CRI D5CE D5CEI D5DR D5DRI D5DE D5DEI D2A_Ranked D2B_Ranked D3B_Ranked D4A_Ranked NatWalkInd Households Workers_1 Residents Drivers White Male Lowwage Medwage Highwage W_P_Lowwage W_P_Medwage W_P_Highwage GasPrice logd1a logd1c logd3aao logd3apo d4bo25 d5dei_1 logd4d UPTpercap B_C_constant B_C_male B_C_ld1c B_C_drvmveh B_C_ld1a B_C_ld3apo B_C_inc1 B_C_gasp B_N_constant B_N_inc2 B_N_inc3 B_N_white B_N_male B_N_drvmveh B_N_gasp B_N_ld1a B_N_ld1c B_N_ld3aao B_N_ld3apo B_N_d4bo25 B_N_d5dei B_N_UPTpc C_R_Households C_R_Pop C_R_Workers C_R_Drivers C_R_Vehicles C_R_White C_R_Male C_R_Lowwage C_R_Medwage C_R_Highwage C_R_DrmV NonCom_VMT_Per_Worker Com_VMT_Per_Worker VMT_per_worker VMT_tot_min VMT_tot_max VMT_tot_avg GHG_per_worker Annual_GHG SLC_score COUNTY_NAME target
0 60971513084 60971513084 6 97 151308 4 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 67.135587 0.0 67.135587 60.291730 396 169 131 0.490 0 0.000000 10 0.076336 121 0.923664 276 46 78 152 0.166667 30 4 0 16 10 0 4 0 16 9 0 0 1 0 6 9 15 0.200000 2.803038 6.568065 0.497581 0.066344 0.000000 0.265376 0.165860 0.000000 0.066344 0.000000 0.265376 0.149274 0.000000 0.000000 0.016586 0.0 3.300619 0.0 0.229008 0.883037 0.602767 0.777957 0.518638 0.477267 0.397559 0.395591 9.525833e-02 0.140845 0.196078 9.200000 2.746536e-04 22.454792 0.000000 0.064477 22.390315 149.409879 0.000000 0.000000 0.000000 181.125994 28.598841 615.57 0.00000 0.000000 0.67 6.387075 0.001692 20817.0 23238.0 3423.0 4467.0 0.002432 0.470739 0.002547 0.504417 0.001056 0.058551 0.001686 0.098464 9.0 9.0 17.0 15.0 13.666667 160 276 464 273.68 324 244 46 78 152 0.200000 0.300000 0.500000 342 1.335800 0.403851 0.00000 3.152322 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.586229 36.685037 41.271266 18.791349 72.049792 39.60906 36.772698 9560.901540 57.790885 Sonoma County Bay Area
1 60971513092 60971513092 6 97 151309 2 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 570.037214 0.0 570.037214 471.335950 2560 896 869 0.593 8 0.009206 200 0.230150 661 0.760644 1406 289 358 759 0.205548 63 1 0 24 36 2 1 0 24 13 2 5 18 0 20 19 24 0.317460 1.900980 5.431370 0.133663 0.002122 0.000000 0.050919 0.076379 0.004243 0.002122 0.000000 0.050919 0.027581 0.004243 0.010608 0.038189 0.0 2.034642 0.0 0.072497 0.622319 0.536037 0.796767 0.686537 0.189938 0.234089 0.244020 1.481149e-03 0.048037 0.085773 22.317460 5.520070e-10 9.245028 1.434447 1.089662 6.720919 42.302361 4.490935 1.122734 2.245467 37.050213 14.595538 742.98 0.00000 0.000000 0.67 0.752232 0.000262 23024.0 26141.0 265.0 2111.0 0.002689 0.520646 0.002865 0.567431 0.000082 0.004533 0.000797 0.046532 3.0 15.0 9.0 14.0 10.666667 966 1406 2882 2041.60 1853 1286 289 358 759 0.317460 0.301587 0.380952 342 1.065048 0.125454 0.88972 2.043933 0 0 1 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 6.306880 45.213616 51.520496 18.791349 72.049792 39.60906 45.904762 11935.238176 38.546556 Sonoma County Bay Area
2 60971513093 60971513093 6 97 151309 3 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 982.997406 0.0 982.997406 982.846330 1610 712 648 0.574 77 0.118827 187 0.288580 384 0.592593 665 149 171 345 0.224060 282 0 0 31 245 6 0 0 31 5 6 61 179 0 83 104 95 0.294326 0.724427 1.638099 0.286922 0.000000 0.000000 0.031541 0.249276 0.006105 0.000000 0.000000 0.031541 0.005087 0.006105 0.062065 0.182124 0.0 1.011348 0.0 0.435185 0.406720 0.277630 0.631164 0.488506 0.540334 0.441643 0.477021 9.053520e-01 0.298097 0.595565 2.358156 2.571345e-01 6.367728 0.795925 0.849893 4.721909 21.711227 1.302140 4.557489 0.000000 22.136376 3.906419 1098.38 0.00000 0.000000 0.67 0.436217 0.000416 22544.0 25770.0 368.0 1478.0 0.002633 0.509792 0.002824 0.559378 0.000114 0.006295 0.000558 0.032579 11.0 8.0 7.0 13.0 9.833333 660 665 1722 1156.32 1283 647 149 171 345 0.294326 0.368794 0.336879 342 0.544895 0.252253 0.58552 1.744303 0 0 0 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 7.067722 45.780375 52.848097 18.791349 72.049792 39.60906 47.087654 12242.790059 36.053805 Sonoma County Bay Area
3 60971513101 60971513101 6 97 151310 1 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 51.480814 0.0 51.480814 51.080456 506 199 182 0.709 0 0.000000 35 0.192308 147 0.807692 268 63 73 132 0.235075 142 14 0 8 19 101 14 0 8 13 101 0 6 0 96 37 9 0.676056 3.895815 9.905941 2.779928 0.274077 0.000000 0.156616 0.371962 1.977273 0.274077 0.000000 0.156616 0.254500 1.977273 0.000000 0.117462 0.0 6.675743 0.0 0.780220 0.650603 0.560399 0.612247 0.473864 0.671536 0.565885 0.621614 5.384277e-01 0.438272 0.692683 1.887324 4.117562e-01 27.510900 0.000000 9.572344 17.938556 116.063433 0.000000 12.431816 24.863632 87.022711 24.863632 580.70 0.00000 0.479764 2.00 24.863632 0.003953 21597.0 25470.0 185.0 1195.0 0.002523 0.488377 0.002791 0.552866 0.000057 0.003164 0.000451 0.026341 15.0 7.0 15.0 15.0 13.666667 200 268 552 383.68 317 190 63 73 132 0.676056 0.260563 0.063380 342 1.588381 1.329705 0.00000 2.941200 0 0 3 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 3.675845 24.646966 28.322812 18.791349 72.049792 39.60906 25.235625 6561.262585 82.103377 Sonoma County Bay Area
4 60971513102 60971513102 6 97 151310 2 488.0 San Jose-San Francisco-Oakland, CA 42220.0 Santa Rosa-Petaluma, CA 501317.0 201244.0 219300.0 94.734433 0.0 94.734433 89.936999 1944 700 674 0.653 68 0.100890 97 0.143917 509 0.755193 502 93 166 243 0.185259 18 1 1 3 13 0 1 1 3 1 0 0 12 0 11 5 2 0.611111 7.783226 21.615131 0.200140 0.011119 0.011119 0.033357 0.144546 0.000000 0.011119 0.011119 0.033357 0.011119 0.000000 0.000000 0.133427 0.0 7.983366 0.0 0.026706 0.616612 0.531121 0.652815 0.505263 0.088743 0.129022 0.139861 1.035771e-07 0.018349 0.069231 27.888889 2.100409e-12 21.003141 0.000000 4.051075 16.952065 92.364304 0.000000 6.755727 13.511455 101.335910 6.755727 692.02 0.15082 0.597189 2.00 13.511455 0.001029 21512.0 25326.0 162.0 494.0 0.002513 0.486455 0.002775 0.549741 0.000050 0.002771 0.000186 0.010889 1.0 8.0 14.0 14.0 10.833333 636 502 1741 1295.36 1217 915 93 166 243 0.611111 0.277778 0.111111 342 2.172844 0.182438 0.00000 2.887705 0 0 3 8 0.119722 0.113615 -0.174782 -0.245813 0.099881 -0.238419 -0.654242 0.01211 1.963416 0.085305 0.132144 -0.164703 0.082742 -0.174723 0.000804 -0.180891 -0.180891 0.037875 -0.170452 -0.234661 -0.235263 -0.002019 189374 499772 219301 358126.56 355082 0.632332 0.488313 0.209589 0.301544 0.488862 0.016077 4.009743 33.743783 37.753526 18.791349 72.049792 39.60906 33.638392 8745.981842 64.395923 Sonoma County Bay Area
In [ ]:
model_data['COUNTY_NAME'].value_counts()
In [ ]:
# Check negative values and nulls again
numeric_cols = model_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(model_data.columns[model_data.isna().any()].tolist())
print(model_data[numeric_cols].columns[(model_data[numeric_cols] < 0 ).any()].tolist())
print(model_data[numeric_cols].columns[(model_data[numeric_cols] < 0 ).all()].tolist())
['CSA', 'CSA_Name', 'CBSA', 'CBSA_Name']
['B_C_male', 'B_C_ld1c', 'B_C_drvmveh', 'B_C_ld1a', 'B_C_ld3apo', 'B_C_inc1', 'B_C_gasp', 'B_N_inc2', 'B_N_inc3', 'B_N_white', 'B_N_drvmveh', 'B_N_gasp', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo', 'B_N_d4bo25', 'B_N_d5dei', 'B_N_UPTpc', 'C_R_DrmV']
['B_N_white', 'B_N_drvmveh', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo', 'B_N_d4bo25']

Let's find some patterns using a tier system (jobs, traffic, public transportation, population, walkability)¶

In [253]:
# Job Columns
job_cols = ['D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_HLTH', 'D1C8_PUB']  # office, industry, service, education, healthcare, public administration
model_data['D1C8_SUM'] = model_data[job_cols].sum(axis=1)
model_data['D1C8_SUM'] = model_data['D1C8_SUM'] / model_data['Ac_Unpr']
model_data[job_cols + ['D1C8_SUM', 'D2C_WREMLX']]
Out[253]:
D1C8_OFF D1C8_IND D1C8_SVC D1C8_HLTH D1C8_PUB D1C8_SUM D2C_WREMLX
0 0.000000 0.265376 0.149274 0.016586 0.0 0.007152 2.746536e-04
1 0.000000 0.050919 0.027581 0.038189 0.0 0.000248 5.520070e-10
2 0.000000 0.031541 0.005087 0.182124 0.0 0.000223 2.571345e-01
3 0.000000 0.156616 0.254500 0.117462 0.0 0.010348 4.117562e-01
4 0.011119 0.033357 0.011119 0.133427 0.0 0.002102 2.100409e-12
... ... ... ... ... ... ... ...
13466 0.613009 2.932155 1.277459 0.458685 0.0 0.022640 4.491367e-01
13467 0.565495 6.942588 2.201728 5.769423 0.0 0.013323 3.853825e-01
13468 0.000000 0.299218 0.000000 0.089765 0.0 0.005820 1.268813e-02
13469 0.034125 0.145032 0.059719 0.034125 0.0 0.002329 1.302689e-08
13470 0.012383 0.458157 0.099061 0.037148 0.0 0.007513 2.520925e-04

13471 rows × 7 columns

In [254]:
model_data['CSA'].value_counts()
Out[254]:
CSA
348.0    8152
488.0    2961
472.0     840
260.0     461
454.0      63
Name: count, dtype: int64
In [276]:
# Apply normalization/standardization
scaler = MinMaxScaler().set_output(transform="pandas")

# Multiply sum and mix columns
model_data['job_score'] = model_data['D1C8_SUM'].mul(1 - model_data['D2C_WREMLX'])  # we want low values since it is workers/total employment (number of jobs)
model_data[['job_score']] = scaler.fit_transform(model_data[['job_score']])
model_data[job_cols + ['D1C8_SUM', 'D2C_WREMLX', 'job_score',  'COUNTY_NAME', 'target']].sort_values('job_score', ascending=False)
Out[276]:
D1C8_OFF D1C8_IND D1C8_SVC D1C8_HLTH D1C8_PUB D1C8_SUM D2C_WREMLX job_score COUNTY_NAME target
7228 124.264723 6.809026 24.580584 326.152342 251.968004 24.981454 0.375348 1.000000 San Francisco County Bay Area
2852 74.388057 14.742905 151.919272 18.335085 0.000000 19.411640 0.459727 0.672079 San Francisco County Bay Area
6062 9.061000 1.057117 381.921143 28.957445 0.000000 15.894379 0.388301 0.623054 San Francisco County Bay Area
2851 324.996452 57.937571 298.387744 10.041895 0.000000 10.664518 0.373356 0.428259 San Francisco County Bay Area
9024 3.677608 0.000000 1.838804 19.964156 0.000000 6.693395 0.137501 0.369956 San Francisco County Bay Area
... ... ... ... ... ... ... ... ... ... ...
2331 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Sacramento County Central California
7272 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Sacramento County Central California
6323 0.168078 0.077574 0.155149 2.049256 0.000000 0.015838 1.000000 0.000000 Stanislaus County Central California
11444 0.000000 0.108874 0.038426 0.563582 0.000000 0.004553 1.000000 0.000000 Fresno County Central California
1297 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Santa Clara County Bay Area

13471 rows × 10 columns

In [278]:
model_data['D2C_WREMLX'].describe()
Out[278]:
count    13471.000000
mean         0.288830
std          0.311301
min          0.000000
25%          0.002644
50%          0.149160
75%          0.535116
max          1.000000
Name: D2C_WREMLX, dtype: float64
In [279]:
# Traffic columns
traffic_cols = ['D2C_TRIPEQ', 'D3A', 'D3B', 'D5AR','NatWalkInd']  # distance matric from job to home and vice versa, total road network density, street road network density, jobs within 45 minutes travel time, walkability index (higher is better)
model_data['D3A'] = 1 - model_data['D3A']
model_data['D3B'] = 1 - model_data['D3B']
model_data[traffic_cols] = scaler.fit_transform(model_data[traffic_cols])
model_data['traffic_score'] = model_data[traffic_cols].sum(axis=1) / len(traffic_cols)
model_data[traffic_cols + ['traffic_score']].sort_values('traffic_score', ascending=False)
Out[279]:
D2C_TRIPEQ D3A D3B D5AR NatWalkInd traffic_score
227 0.976289 0.789169 0.890587 0.809755 0.813953 0.855951
2719 0.921273 0.669158 0.954830 0.995002 0.651163 0.838285
6657 0.993728 0.699506 0.906954 0.757107 0.790698 0.829598
7594 0.967988 0.669736 0.857759 0.804218 0.813953 0.822731
5785 0.956130 0.677275 0.875988 0.741041 0.860465 0.822180
... ... ... ... ... ... ...
10174 0.057621 0.199694 0.210870 0.345388 0.639535 0.290622
1528 0.035523 0.158313 0.267526 0.271861 0.639535 0.274551
8902 0.031081 0.197573 0.308331 0.160787 0.662791 0.272112
2458 0.038702 0.144209 0.000000 0.091349 0.744186 0.203689
13117 0.002439 0.000000 0.068731 0.207854 0.581395 0.172084

13471 rows × 6 columns

In [280]:
model_data['combined_score'] = (model_data['job_score'] + model_data['traffic_score']) / 2
model_data[['job_score', 'traffic_score', 'combined_score', 'COUNTY_NAME', 'target']].sort_values('combined_score', ascending=False)
Out[280]:
job_score traffic_score combined_score COUNTY_NAME target
7228 1.000000 0.589772 0.794886 San Francisco County Bay Area
2852 0.672079 0.602886 0.637482 San Francisco County Bay Area
6062 0.623054 0.420562 0.521808 San Francisco County Bay Area
9024 0.369956 0.661863 0.515909 San Francisco County Bay Area
201 0.305844 0.709597 0.507720 Los Angeles County Southern California
... ... ... ... ... ...
10174 0.003773 0.290622 0.147197 Los Angeles County Southern California
1528 0.006105 0.274551 0.140328 San Francisco County Bay Area
8902 0.000831 0.272112 0.136472 Santa Clara County Bay Area
2458 0.000694 0.203689 0.102192 Santa Clara County Bay Area
13117 0.001731 0.172084 0.086907 Santa Clara County Bay Area

13471 rows × 5 columns

In [281]:
model_data.loc[model_data['target'] ==  'Southern California'].groupby('COUNTY_NAME')['TotPop'].mean().sort_values(ascending=False)
Out[281]:
COUNTY_NAME
Riverside County          2244.340094
San Bernardino County     1955.863448
Kern County               1782.683636
San Diego County          1745.905488
Orange County             1735.675289
Santa Barbara County      1611.385321
San Luis Obispo County    1592.247525
Los Angeles County        1562.340418
Imperial County           1473.333333
Name: TotPop, dtype: float64
In [291]:
model_data.loc[model_data['target'] ==  'Southern California'].groupby('COUNTY_NAME')['job_score'].mean().sort_values(ascending=False)
Out[291]:
COUNTY_NAME
Los Angeles County        0.002930
Santa Barbara County      0.001034
Orange County             0.000894
San Diego County          0.000680
San Bernardino County     0.000369
Riverside County          0.000317
San Luis Obispo County    0.000307
Kern County               0.000265
Imperial County           0.000107
Name: job_score, dtype: float64
In [292]:
model_data.loc[model_data['target'] ==  'Southern California'].groupby('COUNTY_NAME')['traffic_score'].mean().sort_values(ascending=False)
Out[292]:
COUNTY_NAME
Los Angeles County        0.642654
Orange County             0.584887
San Luis Obispo County    0.557148
San Bernardino County     0.552887
Riverside County          0.551792
Santa Barbara County      0.549674
Kern County               0.543568
San Diego County          0.542136
Imperial County           0.531919
Name: traffic_score, dtype: float64
In [282]:
model_data.loc[model_data['target'] ==  'Southern California'].groupby('COUNTY_NAME')['combined_score'].mean().sort_values(ascending=False)
Out[282]:
COUNTY_NAME
Los Angeles County        0.322792
Orange County             0.292890
San Luis Obispo County    0.278727
San Bernardino County     0.276628
Riverside County          0.276054
Santa Barbara County      0.275354
Kern County               0.271917
San Diego County          0.271408
Imperial County           0.266013
Name: combined_score, dtype: float64
In [289]:
model_data.loc[model_data['target'] ==  'Southern California',['job_score', 'traffic_score', 'combined_score', 'COUNTY_NAME', 'target', 'GEOID20']].sort_values('combined_score', ascending=False)
Out[289]:
job_score traffic_score combined_score COUNTY_NAME target GEOID20
201 3.058435e-01 0.709597 0.507720 Los Angeles County Southern California 60372124101
10634 1.606575e-01 0.804137 0.482397 Los Angeles County Southern California 60372073011
4324 2.253846e-01 0.734779 0.480082 Los Angeles County Southern California 60372121021
2725 2.763699e-01 0.606381 0.441375 Los Angeles County Southern California 60372679023
5750 9.398291e-02 0.784949 0.439466 Los Angeles County Southern California 60372073021
... ... ... ... ... ... ...
9657 6.970668e-04 0.351461 0.176079 Orange County Southern California 60590888024
1900 1.798097e-03 0.349198 0.175498 Los Angeles County Southern California 60376210045
4523 4.299220e-07 0.339721 0.169861 Santa Barbara County Southern California 60830029221
9487 7.744602e-04 0.318034 0.159404 Orange County Southern California 60590635003
10174 3.773260e-03 0.290622 0.147197 Los Angeles County Southern California 60376209042

8968 rows × 6 columns

In [284]:
score_cols = model_data[['job_score', 'traffic_score', 'combined_score']]
score_cols.head()
Out[284]:
job_score traffic_score combined_score
0 0.000458 0.460563 0.230511
1 0.000016 0.448862 0.224439
2 0.000011 0.628081 0.314046
3 0.000390 0.541434 0.270912
4 0.000135 0.413839 0.206987
In [203]:
plot_cols(model_data, ['job_score', 'traffic_score', 'combined_score'], 3, 'histplot')
No description has been provided for this image
In [142]:
#california_data = california_data.reset_index().drop('index', axis=1)
In [286]:
california_data_with_scores = pd.concat([california_data,score_cols], axis=1)
In [287]:
california_data_with_scores.loc[california_data_with_scores['target'] ==  'Southern California',['job_score', 'traffic_score', 'combined_score', 'COUNTY_NAME', 'target', 'GEOID20', 'CSA']].sort_values('combined_score', ascending=False)
Out[287]:
job_score traffic_score combined_score COUNTY_NAME target GEOID20 CSA
201 3.058435e-01 0.709597 0.507720 Los Angeles County Southern California 060372124101 348
10634 1.606575e-01 0.804137 0.482397 Los Angeles County Southern California 060372073011 348
4324 2.253846e-01 0.734779 0.480082 Los Angeles County Southern California 060372121021 348
2725 2.763699e-01 0.606381 0.441375 Los Angeles County Southern California 060372679023 348
5750 9.398291e-02 0.784949 0.439466 Los Angeles County Southern California 060372073021 348
... ... ... ... ... ... ... ...
9657 6.970668e-04 0.351461 0.176079 Orange County Southern California 060590888024 348
1900 1.798097e-03 0.349198 0.175498 Los Angeles County Southern California 060376210045 348
4523 4.299220e-07 0.339721 0.169861 Santa Barbara County Southern California 060830029221 None
9487 7.744602e-04 0.318034 0.159404 Orange County Southern California 060590635003 348
10174 3.773260e-03 0.290622 0.147197 Los Angeles County Southern California 060376209042 348

8968 rows × 7 columns